public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 0/4] x86: some more optimization plus a new pseudo insn form
@ 2023-06-16  7:29 Jan Beulich
  2023-06-16  7:30 ` [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Jan Beulich @ 2023-06-16  7:29 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

While the first three patches here may merely require establishing
how far we want to go with the optimizations that are possible,
the last patch is at least partly an RFC, since it goes beyond what
vendor documentation states / suggests. Constructive comments are
welcome.

1: optimize pre-AVX512 {,V}PCMPEQQ with identical sources
2: optimize pre-AVX512 {,V}PCMPGT* with identical sources
3: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ
4: provide a 128-bit VBROADCASTSD pseudo

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources
  2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
@ 2023-06-16  7:30 ` Jan Beulich
  2023-06-16  7:31 ` [PATCH 2/4] x86: optimize pre-AVX512 {,V}PCMPGT* " Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2023-06-16  7:30 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

The {,V}PCMPEQD alternative is 1 byte shorter in many cases.
---
It's not really clear whether the same would be worthwhile for AVX512
forms: Some could be expressed via KXNOR* (when no masking is in effect)
or KOR* (when masking is in effect), but others cannot. And while in
pre-AVX512 code these patterns are likely to be used to produce all-ones
idioms, this looks less likely in AVX512.

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4563,6 +4563,23 @@ optimize_encoding (void)
 	i.types[j].bitfield.disp8
 	  = fits_in_disp8 (i.op[j].disps->X_add_number);
     }
+  else if (optimize_for_space
+	   && i.tm.base_opcode == 0x29
+	   && i.tm.opcode_space == SPACE_0F38
+	   && i.operands == i.reg_operands
+	   && i.op[0].regs == i.op[1].regs
+	   && (!i.tm.opcode_modifier.vex
+	       || !(i.op[0].regs->reg_flags & RegRex))
+	   && !is_evex_encoding (&i.tm))
+    {
+      /* Optimize: -Os:
+         pcmpeqq %xmmN, %xmmN          -> pcmpeqd %xmmN, %xmmN
+         vpcmpeqq %xmmN, %xmmN, %xmmM  -> vpcmpeqd %xmmN, %xmmN, %xmmM (N < 8)
+         vpcmpeqq %ymmN, %ymmN, %ymmM  -> vpcmpeqd %ymmN, %ymmN, %ymmM (N < 8)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0x76;
+    }
 }
 
 /* Return non-zero for load instruction.  */
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -161,4 +161,7 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%eax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%eax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%eax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %ymm2,%ymm2,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/optimize-2.s
+++ b/gas/testsuite/gas/i386/optimize-2.s
@@ -180,3 +180,7 @@ _start:
 	vporq		(%eax){1to2}, %xmm2, %xmm3
 	vpxord		(%eax){1to4}, %xmm2, %xmm3
 	vpxorq		(%eax){1to4}, %ymm2, %ymm3
+
+	pcmpeqq		%xmm2, %xmm2
+	vpcmpeqq	%xmm2, %xmm2, %xmm0
+	vpcmpeqq	%ymm2, %ymm2, %ymm0
--- a/gas/testsuite/gas/i386/optimize-2b.d
+++ b/gas/testsuite/gas/i386/optimize-2b.d
@@ -162,4 +162,7 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%eax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%eax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%eax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm2,%xmm2
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm2,%ymm2,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -199,4 +199,10 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%rax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%rax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%rax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm12,%xmm12
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s
@@ -221,3 +221,11 @@ _start:
 	vporq		(%rax){1to2}, %xmm2, %xmm3
 	vpxord		(%rax){1to4}, %xmm2, %xmm3
 	vpxorq		(%rax){1to4}, %ymm2, %ymm3
+
+	pcmpeqq		%xmm2, %xmm2
+	vpcmpeqq	%xmm2, %xmm2, %xmm0
+	vpcmpeqq	%ymm2, %ymm2, %ymm0
+
+	pcmpeqq		%xmm12, %xmm12
+	vpcmpeqq	%xmm12, %xmm12, %xmm0
+	vpcmpeqq	%ymm12, %ymm12, %ymm0
--- a/gas/testsuite/gas/i386/x86-64-optimize-3b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3b.d
@@ -200,4 +200,10 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%rax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%rax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%rax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm2,%xmm2
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm12,%xmm12
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
 #pass
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1363,7 +1363,7 @@ pblendvb, 0x664c, AVX, Modrm|Vex128|Spac
 pblendvb, 0x660f3810, SSE4_1, Modrm|NoSuf, { Acc|Xmmword, RegXMM|Unspecified|BaseIndex, RegXMM }
 pblendvb, 0x660f3810, SSE4_1, Modrm|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
 pblendw<sse41>, 0x660f3a0e, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
-pcmpeqq<sse41>, 0x660f3829, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
+pcmpeqq<sse41>, 0x660f3829, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf|Optimize, { RegXMM|Unspecified|BaseIndex, RegXMM }
 pextr<bw><sse41>, 0x660f3a14 | <bw:opc>, <sse41:cpu>, RegMem|<sse41:attr>|NoSuf|IgnoreSize|NoRex64, { Imm8, RegXMM, Reg32|Reg64 }
 pextr<bw><sse41>, 0x660f3a14 | <bw:opc>, <sse41:cpu>, Modrm|<sse41:attr>|NoSuf, { Imm8, RegXMM, <bw:elem>|Unspecified|BaseIndex }
 pextrd<sse41>, 0x660f3a16, <sse41:cpu>, Modrm|<sse41:attr>|NoSuf|IgnoreSize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
@@ -1592,7 +1592,7 @@ vpblendvb, 0x664c, AVX|AVX2, Modrm|Vex|S
 vpblendw, 0x660e, AVX|AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpeq<bw>, 0x6674 | <bw:opc>, AVX|AVX2, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpeqd, 0x6676, AVX|AVX2, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpcmpeqq, 0x6629, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpcmpeqq, 0x6629, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf|Optimize, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpestri, 0x6661, AVX|No64, Modrm|Vex|Space0F3A|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestri, 0x6661, AVX|x64, Modrm|Vex|Space0F3A|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestrm, 0x6660, AVX|No64, Modrm|Vex|Space0F3A|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/4] x86: optimize pre-AVX512 {,V}PCMPGT* with identical sources
  2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
  2023-06-16  7:30 ` [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources Jan Beulich
@ 2023-06-16  7:31 ` Jan Beulich
  2023-06-16  7:31 ` [PATCH 3/4] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ Jan Beulich
  2023-06-16  7:32 ` [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo Jan Beulich
  3 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2023-06-16  7:31 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

These are better expressed by the zeroing idiom {,V}PXOR. In some cases
this also results in a shorter encoding.
---
Thoughts towards doing the same for {,V}PSUB{,U}S{B,W}, anyone?

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4580,6 +4580,46 @@ optimize_encoding (void)
       i.tm.opcode_space = SPACE_0F;
       i.tm.base_opcode = 0x76;
     }
+  else if (((i.tm.base_opcode >= 0x64
+	     && i.tm.base_opcode <= 0x66
+	     && i.tm.opcode_space == SPACE_0F)
+	    || (i.tm.base_opcode == 0x37
+		&& i.tm.opcode_space == SPACE_0F38))
+	   && i.operands == i.reg_operands
+	   && i.op[0].regs == i.op[1].regs
+	   && !is_evex_encoding (&i.tm))
+    {
+      /* Optimize: -O:
+         pcmpgt[bwd] %mmN, %mmN             -> pxor %mmN, %mmN
+         pcmpgt[bwdq] %xmmN, %xmmN          -> pxor %xmmN, %xmmN
+         vpcmpgt[bwdq] %xmmN, %xmmN, %xmmM  -> vpxor %xmmN, %xmmN, %xmmM (N < 8)
+         vpcmpgt[bwdq] %xmmN, %xmmN, %xmmM  -> vpxor %xmm0, %xmm0, %xmmM (N > 7)
+         vpcmpgt[bwdq] %ymmN, %ymmN, %ymmM  -> vpxor %ymmN, %ymmN, %ymmM (N < 8)
+         vpcmpgt[bwdq] %ymmN, %ymmN, %ymmM  -> vpxor %ymm0, %ymm0, %ymmM (N > 7)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0xef;
+      if (i.tm.opcode_modifier.vex && (i.op[0].regs->reg_flags & RegRex))
+	{
+	  if (i.operands == 2)
+	    {
+	      gas_assert (i.tm.opcode_modifier.sse2avx);
+
+	      i.operands = 3;
+	      i.reg_operands = 3;
+	      i.tm.operands = 3;
+
+	      i.op[2].regs = i.op[0].regs;
+	      i.types[2] = i.types[0];
+	      i.flags[2] = i.flags[0];
+	      i.tm.operand_types[2] = i.tm.operand_types[0];
+
+	      i.tm.opcode_modifier.sse2avx = 0;
+	    }
+	  i.op[0].regs -= i.op[0].regs->reg_num + 8;
+	  i.op[1].regs = i.op[0].regs;
+	}
+    }
 }
 
 /* Return non-zero for load instruction.  */
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -147,6 +147,21 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%eax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -171,6 +171,25 @@ _start:
 	vpxord		128(%eax), %ymm2, %ymm3
 	vpxorq		128(%eax), %ymm2, %ymm3
 
+	pcmpgtb		%mm2, %mm2
+	pcmpgtb		%xmm2, %xmm2
+	vpcmpgtb	%xmm2, %xmm2, %xmm0
+	vpcmpgtb	%ymm2, %ymm2, %ymm0
+
+	pcmpgtw		%mm2, %mm2
+	pcmpgtw		%xmm2, %xmm2
+	vpcmpgtw	%xmm2, %xmm2, %xmm0
+	vpcmpgtw	%ymm2, %ymm2, %ymm0
+
+	pcmpgtd		%mm2, %mm2
+	pcmpgtd		%xmm2, %xmm2
+	vpcmpgtd	%xmm2, %xmm2, %xmm0
+	vpcmpgtd	%ymm2, %ymm2, %ymm0
+
+	pcmpgtq		%xmm2, %xmm2
+	vpcmpgtq	%xmm2, %xmm2, %xmm0
+	vpcmpgtq	%ymm2, %ymm2, %ymm0
+
 	bt	$15, %ax
 	bt	$16, %ax
 	btc	$15, %ax
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -148,6 +148,21 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%eax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -147,6 +147,21 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%eax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -147,6 +147,21 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%eax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%eax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm2,%ymm0
  +[a-f0-9]+:	0f ba e0 0f          	bt     \$0xf,%eax
  +[a-f0-9]+:	66 0f ba e0 10       	bt     \$0x10,%ax
  +[a-f0-9]+:	0f ba f8 0f          	btc    \$0xf,%eax
--- a/gas/testsuite/gas/i386/x86-64.exp
+++ b/gas/testsuite/gas/i386/x86-64.exp
@@ -520,6 +520,7 @@ run_dump_test "x86-64-optimize-1"
 run_dump_test "x86-64-optimize-2"
 run_dump_test "x86-64-optimize-2a"
 run_dump_test "x86-64-optimize-2b"
+run_dump_test "x86-64-optimize-2c"
 run_dump_test "x86-64-optimize-3"
 run_dump_test "x86-64-optimize-3b"
 run_dump_test "x86-64-optimize-4"
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.d
@@ -203,4 +203,23 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.s
@@ -226,3 +226,26 @@ _start:
 	vporq		128(%rax), %ymm2, %ymm3
 	vpxord		128(%rax), %ymm2, %ymm3
 	vpxorq		128(%rax), %ymm2, %ymm3
+
+	pcmpgtb		%mm2, %mm2
+	pcmpgtb		%xmm2, %xmm2
+	pcmpgtb		%xmm12, %xmm12
+	vpcmpgtb	%xmm2, %xmm2, %xmm8
+	vpcmpgtb	%ymm12, %ymm12, %ymm1
+
+	pcmpgtw		%mm2, %mm2
+	pcmpgtw		%xmm2, %xmm2
+	pcmpgtw		%xmm12, %xmm12
+	vpcmpgtw	%xmm2, %xmm2, %xmm8
+	vpcmpgtw	%ymm12, %ymm12, %ymm1
+
+	pcmpgtd		%mm2, %mm2
+	pcmpgtd		%xmm2, %xmm2
+	pcmpgtd		%xmm12, %xmm12
+	vpcmpgtd	%xmm2, %xmm2, %xmm8
+	vpcmpgtd	%ymm12, %ymm12, %ymm1
+
+	pcmpgtq		%xmm2, %xmm2
+	pcmpgtq		%xmm12, %xmm12
+	vpcmpgtq	%xmm2, %xmm2, %xmm8
+	vpcmpgtq	%ymm12, %ymm12, %ymm1
--- a/gas/testsuite/gas/i386/x86-64-optimize-2a.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
@@ -204,4 +204,23 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-2b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2b.d
@@ -203,4 +203,23 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
 #pass
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2c.d
@@ -0,0 +1,226 @@
+#source: x86-64-optimize-2.s
+#as: -O -msse2avx
+#objdump: -drw
+#name: x86-64 optimized encoding 2c with -O and SSE2AVX
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+ +[a-f0-9]+:	62 71 f5 4f 55 f9    	vandnpd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 55 f9          	vandnpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 55 c1    	vandnpd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 55 c1    	vandnpd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 55 c9    	vandnpd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 55 c9    	vandnpd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 74 4f 55 f9    	vandnps %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 55 f9          	vandnps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 74 48 55 c1    	vandnps %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 74 28 55 c1    	vandnps %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 74 40 55 c9    	vandnps %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 74 20 55 c9    	vandnps %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 71 75 4f df f9    	vpandnd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 df c1    	vpandnd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 df c1    	vpandnd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 df c9    	vpandnd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 df c9    	vpandnd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f df f9    	vpandnq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 df f9          	vpandn %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 df c1    	vpandnq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 df c1    	vpandnq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 df c9    	vpandnq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 df c9    	vpandnq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f 57 f9    	vxorpd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 57 f9          	vxorpd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 57 c1    	vxorpd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 57 c1    	vxorpd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 57 c9    	vxorpd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 57 c9    	vxorpd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 74 4f 57 f9    	vxorps %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 70 57 f9          	vxorps %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 74 48 57 c1    	vxorps %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 74 28 57 c1    	vxorps %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 74 40 57 c9    	vxorps %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 74 20 57 c9    	vxorps %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 71 75 4f ef f9    	vpxord %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 ef c1    	vpxord %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 ef c1    	vpxord %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 ef c9    	vpxord %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 ef c9    	vpxord %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f ef f9    	vpxorq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 ef f9          	vpxor  %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 ef c1    	vpxorq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 ef c1    	vpxorq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 ef c9    	vpxorq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 ef c9    	vpxorq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f f8 f9    	vpsubb %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f8 f9          	vpsubb %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 f8 c1    	vpsubb %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 f8 c1    	vpsubb %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 f8 c9    	vpsubb %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 f8 c9    	vpsubb %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f f9 f9    	vpsubw %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 f9 f9          	vpsubw %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 f9 c1    	vpsubw %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 f9 c1    	vpsubw %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 f9 c9    	vpsubw %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 f9 c9    	vpsubw %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 75 4f fa f9    	vpsubd %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fa f9          	vpsubd %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 75 48 fa c1    	vpsubd %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 75 28 fa c1    	vpsubd %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 75 40 fa c9    	vpsubd %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 75 20 fa c9    	vpsubd %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	62 71 f5 4f fb f9    	vpsubq %zmm1,%zmm1,%zmm15\{%k7\}
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	c5 71 fb f9          	vpsubq %xmm1,%xmm1,%xmm15
+ +[a-f0-9]+:	62 e1 f5 48 fb c1    	vpsubq %zmm1,%zmm1,%zmm16
+ +[a-f0-9]+:	62 e1 f5 28 fb c1    	vpsubq %ymm1,%ymm1,%ymm16
+ +[a-f0-9]+:	62 b1 f5 40 fb c9    	vpsubq %zmm17,%zmm17,%zmm1
+ +[a-f0-9]+:	62 b1 f5 20 fb c9    	vpsubq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
+ +[a-f0-9]+:	c5 .*	vpand  %xmm2,%xmm3,%xmm4
+ +[a-f0-9]+:	c4 .*	vpand  %xmm12,%xmm3,%xmm4
+ +[a-f0-9]+:	c5 .*	vpandn %xmm2,%xmm13,%xmm4
+ +[a-f0-9]+:	c5 .*	vpandn %xmm2,%xmm3,%xmm14
+ +[a-f0-9]+:	c5 .*	vpor   %xmm2,%xmm3,%xmm4
+ +[a-f0-9]+:	c4 .*	vpor   %xmm12,%xmm3,%xmm4
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm13,%xmm4
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm3,%xmm14
+ +[a-f0-9]+:	c5 .*	vpand  %ymm2,%ymm3,%ymm4
+ +[a-f0-9]+:	c4 .*	vpand  %ymm12,%ymm3,%ymm4
+ +[a-f0-9]+:	c5 .*	vpandn %ymm2,%ymm13,%ymm4
+ +[a-f0-9]+:	c5 .*	vpandn %ymm2,%ymm3,%ymm14
+ +[a-f0-9]+:	c5 .*	vpor   %ymm2,%ymm3,%ymm4
+ +[a-f0-9]+:	c4 .*	vpor   %ymm12,%ymm3,%ymm4
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm13,%ymm4
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm2,%ymm3,%ymm14
+ +[a-f0-9]+:	c5 .*	vpand  0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpand  0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpandn 0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpandn 0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpor   0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpor   0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpxor  0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpxor  0x70\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpandd 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpandq 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpandnd 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpandnq 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpord  0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%xmm2,%xmm3
+ +[a-f0-9]+:	c5 .*	vpand  0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpand  0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpandn 0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpandn 0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpor   0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpor   0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpxor  0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	c5 .*	vpxor  0x60\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpandd 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpandq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpandnd 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpandnq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpord  0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm0,%xmm0,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm0,%xmm0,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm0,%xmm0,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm0,%xmm0,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+#pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.d
@@ -203,6 +203,25 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	\{evex\} vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	\{evex\} vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.d
@@ -203,6 +203,25 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxord 0x80\(%rax\),%ymm2,%ymm3
  +[a-f0-9]+:	62 .*	vpxorq 0x80\(%rax\),%ymm2,%ymm3
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	0f .*	pxor   %mm2,%mm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
+ +[a-f0-9]+:	66 .*	pxor   %xmm2,%xmm2
+ +[a-f0-9]+:	66 .*	pxor   %xmm12,%xmm12
+ +[a-f0-9]+:	c5 .*	vpxor  %xmm2,%xmm2,%xmm8
+ +[a-f0-9]+:	c5 .*	vpxor  %ymm0,%ymm0,%ymm1
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	\{evex\} vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	\{evex\} vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1025,8 +1025,8 @@ pand<mmx>, 0x<mmx:pfx>0fdb, <mmx:cpu>, M
 pandn<mmx>, 0x<mmx:pfx>0fdf, <mmx:cpu>, Modrm|<mmx:attr>|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
 pcmpeq<bw><mmx>, 0x<mmx:pfx>0f74 | <bw:opc>, <mmx:cpu>, Modrm|<mmx:attr>|C|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
 pcmpeqd<mmx>, 0x<mmx:pfx>0f76, <mmx:cpu>, Modrm|<mmx:attr>|C|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
-pcmpgt<bw><mmx>, 0x<mmx:pfx>0f64 | <bw:opc>, <mmx:cpu>, Modrm|<mmx:attr>|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
-pcmpgtd<mmx>, 0x<mmx:pfx>0f66, <mmx:cpu>, Modrm|<mmx:attr>|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
+pcmpgt<bw><mmx>, 0x<mmx:pfx>0f64 | <bw:opc>, <mmx:cpu>, Modrm|<mmx:attr>|NoSuf|Optimize, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
+pcmpgtd<mmx>, 0x<mmx:pfx>0f66, <mmx:cpu>, Modrm|<mmx:attr>|NoSuf|Optimize, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
 pmaddwd<mmx>, 0x<mmx:pfx>0ff5, <mmx:cpu>, Modrm|<mmx:attr>|C|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
 pmulhw<mmx>, 0x<mmx:pfx>0fe5, <mmx:cpu>, Modrm|<mmx:attr>|C|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
 pmullw<mmx>, 0x<mmx:pfx>0fd5, <mmx:cpu>, Modrm|<mmx:attr>|C|NoSuf, { <mmx:reg>|<mmx:mem>|Unspecified|BaseIndex, <mmx:reg> }
@@ -1405,7 +1405,7 @@ rounds<sd><sse41>, 0x660f3a0a | <sd:opc>
 
 <sse42:cpu:attr:vvvv, $avx:AVX:Vex128|VexW0|SSE2AVX:VexVVVV, $sse:SSE4_2::>
 
-pcmpgtq<sse42>, 0x660f3837, <sse42:cpu>, Modrm|<sse42:attr>|<sse42:vvvv>|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
+pcmpgtq<sse42>, 0x660f3837, <sse42:cpu>, Modrm|<sse42:attr>|<sse42:vvvv>|NoSuf|Optimize, { RegXMM|Unspecified|BaseIndex, RegXMM }
 pcmpestri<sse42>, 0x660f3a61, <sse42:cpu>|No64, Modrm|<sse42:attr>|NoSuf, { Imm8, RegXMM|Unspecified|BaseIndex, RegXMM }
 pcmpestri, 0x6661, AVX|x64, Modrm|Vex|Space0F3A|IgnoreSize|No_bSuf|No_wSuf|No_sSuf|SSE2AVX, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
 pcmpestri, 0x660f3a61, SSE4_2|x64, Modrm|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
@@ -1597,9 +1597,9 @@ vpcmpestri, 0x6661, AVX|No64, Modrm|Vex|
 vpcmpestri, 0x6661, AVX|x64, Modrm|Vex|Space0F3A|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestrm, 0x6660, AVX|No64, Modrm|Vex|Space0F3A|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestrm, 0x6660, AVX|x64, Modrm|Vex|Space0F3A|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpcmpgt<bw>, 0x6664 | <bw:opc>, AVX|AVX2, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpcmpgtd, 0x6666, AVX|AVX2, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpcmpgtq, 0x6637, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpcmpgt<bw>, 0x6664 | <bw:opc>, AVX|AVX2, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf|Optimize, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpcmpgtd, 0x6666, AVX|AVX2, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf|Optimize, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpcmpgtq, 0x6637, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf|Optimize, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpistri, 0x6663, AVX, Modrm|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpistrm, 0x6662, AVX, Modrm|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vperm2f128, 0x6606, AVX, Modrm|Vex256|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 3/4] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ
  2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
  2023-06-16  7:30 ` [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources Jan Beulich
  2023-06-16  7:31 ` [PATCH 2/4] x86: optimize pre-AVX512 {,V}PCMPGT* " Jan Beulich
@ 2023-06-16  7:31 ` Jan Beulich
  2023-06-16  7:32 ` [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo Jan Beulich
  3 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2023-06-16  7:31 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

The alternative is 1 byte shorter when the source is %xmm0-7, as a
2-byte VEX prefix can then be used.

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4620,6 +4620,33 @@ optimize_encoding (void)
 	  i.op[1].regs = i.op[0].regs;
 	}
     }
+  else if (optimize_for_space
+	   && i.tm.base_opcode == 0x59
+	   && i.tm.opcode_space == SPACE_0F38
+	   && i.operands == i.reg_operands
+	   && i.tm.opcode_modifier.vex
+	   && !(i.op[0].regs->reg_flags & RegRex)
+	   && i.op[0].regs->reg_type.bitfield.xmmword
+	   && i.vec_encoding != vex_encoding_vex3)
+    {
+      /* Optimize: -Os:
+         vpbroadcastq %xmmN, %xmmM  -> vpunpcklqdq %xmmN, %xmmN, %xmmM (N < 8)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0x6c;
+      i.tm.opcode_modifier.vexvvvv = 1;
+
+      ++i.operands;
+      ++i.reg_operands;
+      ++i.tm.operands;
+
+      i.op[2].regs = i.op[0].regs;
+      i.types[2] = i.types[0];
+      i.flags[2] = i.flags[0];
+      i.tm.operand_types[2] = i.tm.operand_types[0];
+
+      swap_2_operands (1, 2);
+    }
 }
 
 /* Return non-zero for load instruction.  */
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -164,4 +164,5 @@ Disassembly of section .text:
  +[a-f0-9]+:	66 .*	pcmpeqd %xmm2,%xmm2
  +[a-f0-9]+:	c5 .*	vpcmpeqd %xmm2,%xmm2,%xmm0
  +[a-f0-9]+:	c5 .*	vpcmpeqd %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	c5 .*	vpunpcklqdq %xmm2,%xmm2,%xmm0
 #pass
--- a/gas/testsuite/gas/i386/optimize-2.s
+++ b/gas/testsuite/gas/i386/optimize-2.s
@@ -184,3 +184,5 @@ _start:
 	pcmpeqq		%xmm2, %xmm2
 	vpcmpeqq	%xmm2, %xmm2, %xmm0
 	vpcmpeqq	%ymm2, %ymm2, %ymm0
+
+	vpbroadcastq	%xmm2, %xmm0
--- a/gas/testsuite/gas/i386/optimize-2b.d
+++ b/gas/testsuite/gas/i386/optimize-2b.d
@@ -165,4 +165,5 @@ Disassembly of section .text:
  +[a-f0-9]+:	66 .*	pcmpeqq %xmm2,%xmm2
  +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm2,%xmm2,%xmm0
  +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	c4 .*	vpbroadcastq %xmm2,%xmm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -205,4 +205,6 @@ Disassembly of section .text:
  +[a-f0-9]+:	66 .*	pcmpeqd %xmm12,%xmm12
  +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
  +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
+ +[a-f0-9]+:	c5 .*	vpunpcklqdq %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpbroadcastq %xmm12,%xmm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s
@@ -229,3 +229,6 @@ _start:
 	pcmpeqq		%xmm12, %xmm12
 	vpcmpeqq	%xmm12, %xmm12, %xmm0
 	vpcmpeqq	%ymm12, %ymm12, %ymm0
+
+	vpbroadcastq	%xmm2, %xmm0
+	vpbroadcastq	%xmm12, %xmm0
--- a/gas/testsuite/gas/i386/x86-64-optimize-3b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3b.d
@@ -206,4 +206,6 @@ Disassembly of section .text:
  +[a-f0-9]+:	66 .*	pcmpeqq %xmm12,%xmm12
  +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
  +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
+ +[a-f0-9]+:	c4 .*	vpbroadcastq %xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpbroadcastq %xmm12,%xmm0
 #pass
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1734,7 +1734,7 @@ vbroadcastsd, 0x6619, AVX2, Modrm|Vex=2|
 vbroadcastss, 0x6618, AVX2, Modrm|Vex|Space0F38|VexW=1|NoSuf, { RegXMM, RegXMM|RegYMM }
 vpblendd, 0x6602, AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpbroadcast<bw>, 0x6678 | <bw:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { <bw:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
-vpbroadcast<dq>, 0x6658 | <dq:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { <dq:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
+vpbroadcast<dq>, 0x6658 | <dq:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf|Optimize, { <dq:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
 vperm2i128, 0x6646, AVX2, Modrm|Vex=2|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
 vpermd, 0x6636, AVX2, Modrm|Vex256|Space0F38|VexVVVV|VexW0|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
 vpermpd, 0x6601, AVX2, Modrm|Vex=2|Space0F3A|VexW1|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo
  2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
                   ` (2 preceding siblings ...)
  2023-06-16  7:31 ` [PATCH 3/4] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ Jan Beulich
@ 2023-06-16  7:32 ` Jan Beulich
  2023-06-16 16:59   ` H.J. Lu
  3 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2023-06-16  7:32 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

VBROADCASTSD not supporting 128-bit destinations in any of its AVX,
AVX2, or AVX512F incarnations is presumably because VMOVDDUP already
supports precisely this operation. (It is therefore different
from e.g. VPBROADCASTQ, which has no exact equivalent.) Still, its
absence has led to people using VPBROADCASTQ as a substitute; this could
have been avoided if such a pseudo had been supported from the very
beginning.

Note that the pseudos try to match, as closely as possible, what the
real instructions would have used: VexW0 instead of VexWIG for the AVX
and AVX2 forms, and AVX2 (rather than AVX) as the feature requirement
for the register source form.
---
As this is the first example of us supplying such a pseudo, this is
partly RFC. On top of that, there is also the question of whether to
indeed have split AVX/AVX2 templates, when in principle one (allowing
for both memory and register source) could do.

--- a/gas/testsuite/gas/i386/avx.d
+++ b/gas/testsuite/gas/i386/avx.d
@@ -927,6 +927,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 35 21       	vpmovzxdq \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd %xmm4,\(%ecx\)
 [ 	]*[a-f0-9]+:	c5 f8 13 21          	vmovlps %xmm4,\(%ecx\)
@@ -2768,6 +2769,8 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd %xmm4,\(%ecx\)
--- a/gas/testsuite/gas/i386/avx.s
+++ b/gas/testsuite/gas/i386/avx.s
@@ -982,6 +982,7 @@ _start:
 	vucomisd (%ecx),%xmm4
 
 # Tests for op mem64, xmm
+	vbroadcastsd (%ecx),%xmm4
 	vmovsd (%ecx),%xmm4
 
 # Tests for op xmm, mem64
@@ -2953,6 +2954,8 @@ _start:
 	vucomisd xmm4,[ecx]
 
 # Tests for op mem64, xmm
+	vbroadcastsd xmm4,QWORD PTR [ecx]
+	vbroadcastsd xmm4,[ecx]
 	vmovsd xmm4,QWORD PTR [ecx]
 	vmovsd xmm4,[ecx]
 
--- a/gas/testsuite/gas/i386/avx-16bit.d
+++ b/gas/testsuite/gas/i386/avx-16bit.d
@@ -928,6 +928,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	67 c4 e2 79 35 21    	vpmovzxdq \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	67 c5 f9 2e 21       	vucomisd \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	67 c5 fb 12 21       	vmovddup \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 fb 10 21       	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 f9 13 21       	vmovlpd %xmm4,\(%ecx\)
 [ 	]*[a-f0-9]+:	67 c5 f8 13 21       	vmovlps %xmm4,\(%ecx\)
@@ -2769,6 +2770,8 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	67 c5 f9 2e 21       	vucomisd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 f9 2e 21       	vucomisd \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	67 c5 fb 12 21       	vmovddup \(%ecx\),%xmm4
+[ 	]*[a-f0-9]+:	67 c5 fb 12 21       	vmovddup \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 fb 10 21       	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 fb 10 21       	vmovsd \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	67 c5 f9 13 21       	vmovlpd %xmm4,\(%ecx\)
--- a/gas/testsuite/gas/i386/avx-intel.d
+++ b/gas/testsuite/gas/i386/avx-intel.d
@@ -928,6 +928,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 35 21       	vpmovzxdq xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd xmm6,xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd QWORD PTR \[ecx\],xmm4
 [ 	]*[a-f0-9]+:	c5 f8 13 21          	vmovlps QWORD PTR \[ecx\],xmm4
@@ -2769,6 +2770,8 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd xmm6,xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd QWORD PTR \[ecx\],xmm4
--- a/gas/testsuite/gas/i386/avx2.d
+++ b/gas/testsuite/gas/i386/avx2.d
@@ -73,6 +73,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 78 21       	vpbroadcastb \(%ecx\),%xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb %xmm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%ecx\),%ymm4
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 5d 8c 31       	vpmaskmovd \(%ecx\),%ymm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 4d 8e 21       	vpmaskmovd %ymm4,%ymm6,\(%ecx\)
@@ -177,5 +178,6 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb %xmm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%ecx\),%ymm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%ecx\),%ymm4
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss %xmm4,%xmm6
 #pass
--- a/gas/testsuite/gas/i386/avx2.s
+++ b/gas/testsuite/gas/i386/avx2.s
@@ -114,6 +114,7 @@ _start:
 	vpbroadcastb (%ecx),%ymm4
 
 # Tests for op xmm, xmm
+	vbroadcastsd %xmm4,%xmm6
 	vbroadcastss %xmm4,%xmm6
 
 	.intel_syntax noprefix
@@ -265,4 +266,5 @@ _start:
 	vpbroadcastb ymm4,[ecx]
 
 # Tests for op xmm, xmm
+	vbroadcastsd xmm6,xmm4
 	vbroadcastss xmm6,xmm4
--- a/gas/testsuite/gas/i386/avx2-intel.d
+++ b/gas/testsuite/gas/i386/avx2-intel.d
@@ -74,6 +74,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 78 21       	vpbroadcastb xmm4,BYTE PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb ymm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[ecx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 5d 8c 31       	vpmaskmovd ymm6,ymm4,YMMWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c4 e2 4d 8e 21       	vpmaskmovd YMMWORD PTR \[ecx\],ymm6,ymm4
@@ -178,5 +179,6 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb ymm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[ecx\]
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[ecx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss xmm6,xmm4
 #pass
--- a/gas/testsuite/gas/i386/avx512f_vl.d
+++ b/gas/testsuite/gas/i386/avx512f_vl.d
@@ -155,6 +155,15 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 0x800\(%edx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a 72 80[ 	]*vbroadcasti32x4 -0x800\(%edx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 -0x810\(%edx\),%ymm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 31[ 	]*vmovddup \(%ecx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 31[ 	]*vmovddup \(%ecx\),%xmm6\{%k7\}\{z\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b4 f4 c0 1d fe ff[ 	]*vmovddup -0x1e240\(%esp,%esi,8\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 7f[ 	]*vmovddup 0x3f8\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 00 04 00 00[ 	]*vmovddup 0x400\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 80[ 	]*vmovddup -0x400\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 f8 fb ff ff[ 	]*vmovddup -0x408\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 f5[ 	]*vmovddup %xmm5,%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 f5[ 	]*vmovddup %xmm5,%xmm6\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 31[ 	]*vbroadcastsd \(%ecx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd af 19 31[ 	]*vbroadcastsd \(%ecx\),%ymm6\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 b4 f4 c0 1d fe ff[ 	]*vbroadcastsd -0x1e240\(%esp,%esi,8\),%ymm6\{%k7\}
@@ -5850,6 +5859,15 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 0x800\(%edx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a 72 80[ 	]*vbroadcasti32x4 -0x800\(%edx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 -0x810\(%edx\),%ymm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 31[ 	]*vmovddup \(%ecx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 31[ 	]*vmovddup \(%ecx\),%xmm6\{%k7\}\{z\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b4 f4 c0 1d fe ff[ 	]*vmovddup -0x1e240\(%esp,%esi,8\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 7f[ 	]*vmovddup 0x3f8\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 00 04 00 00[ 	]*vmovddup 0x400\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 80[ 	]*vmovddup -0x400\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 f8 fb ff ff[ 	]*vmovddup -0x408\(%edx\),%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 f5[ 	]*vmovddup %xmm5,%xmm6\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 f5[ 	]*vmovddup %xmm5,%xmm6\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 31[ 	]*vbroadcastsd \(%ecx\),%ymm6\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd af 19 31[ 	]*vbroadcastsd \(%ecx\),%ymm6\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 b4 f4 c0 1d fe ff[ 	]*vbroadcastsd -0x1e240\(%esp,%esi,8\),%ymm6\{%k7\}
--- a/gas/testsuite/gas/i386/avx512f_vl.s
+++ b/gas/testsuite/gas/i386/avx512f_vl.s
@@ -149,6 +149,15 @@ _start:
 	vbroadcasti32x4	2048(%edx), %ymm6{%k7}	 # AVX512{F,VL}
 	vbroadcasti32x4	-2048(%edx), %ymm6{%k7}	 # AVX512{F,VL} Disp8
 	vbroadcasti32x4	-2064(%edx), %ymm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	(%ecx), %xmm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	(%ecx), %xmm6{%k7}{z}	 # AVX512{F,VL}
+	vbroadcastsd	-123456(%esp,%esi,8), %xmm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	1016(%edx), %xmm6{%k7}	 # AVX512{F,VL} Disp8
+	vbroadcastsd	1024(%edx), %xmm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	-1024(%edx), %xmm6{%k7}	 # AVX512{F,VL} Disp8
+	vbroadcastsd	-1032(%edx), %xmm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	%xmm5, %xmm6{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	%xmm5, %xmm6{%k7}{z}	 # AVX512{F,VL}
 	vbroadcastsd	(%ecx), %ymm6{%k7}	 # AVX512{F,VL}
 	vbroadcastsd	(%ecx), %ymm6{%k7}{z}	 # AVX512{F,VL}
 	vbroadcastsd	-123456(%esp,%esi,8), %ymm6{%k7}	 # AVX512{F,VL}
@@ -5846,6 +5855,15 @@ _start:
 	vbroadcasti32x4	ymm6{k7}, XMMWORD PTR [edx+2048]	 # AVX512{F,VL}
 	vbroadcasti32x4	ymm6{k7}, XMMWORD PTR [edx-2048]	 # AVX512{F,VL} Disp8
 	vbroadcasti32x4	ymm6{k7}, XMMWORD PTR [edx-2064]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}, QWORD PTR [ecx]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}{z}, QWORD PTR [ecx]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}, QWORD PTR [esp+esi*8-123456]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}, QWORD PTR [edx+1016]	 # AVX512{F,VL} Disp8
+	vbroadcastsd	xmm6{k7}, QWORD PTR [edx+1024]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}, QWORD PTR [edx-1024]	 # AVX512{F,VL} Disp8
+	vbroadcastsd	xmm6{k7}, QWORD PTR [edx-1032]	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}, xmm5	 # AVX512{F,VL}
+	vbroadcastsd	xmm6{k7}{z}, xmm5	 # AVX512{F,VL}
 	vbroadcastsd	ymm6{k7}, QWORD PTR [ecx]	 # AVX512{F,VL}
 	vbroadcastsd	ymm6{k7}{z}, QWORD PTR [ecx]	 # AVX512{F,VL}
 	vbroadcastsd	ymm6{k7}, QWORD PTR [esp+esi*8-123456]	 # AVX512{F,VL}
--- a/gas/testsuite/gas/i386/avx512f_vl-intel.d
+++ b/gas/testsuite/gas/i386/avx512f_vl-intel.d
@@ -155,6 +155,15 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx\+0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a 72 80[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx-0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx-0x810\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 31[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 31[ 	]*vmovddup xmm6\{k7\}\{z\},QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b4 f4 c0 1d fe ff[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[esp\+esi\*8-0x1e240\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 7f[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx\+0x3f8\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 00 04 00 00[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx\+0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 80[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx-0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 f8 fb ff ff[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx-0x408\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 f5[ 	]*vmovddup xmm6\{k7\},xmm5
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 f5[ 	]*vmovddup xmm6\{k7\}\{z\},xmm5
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 31[ 	]*vbroadcastsd ymm6\{k7\},QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd af 19 31[ 	]*vbroadcastsd ymm6\{k7\}\{z\},QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 b4 f4 c0 1d fe ff[ 	]*vbroadcastsd ymm6\{k7\},QWORD PTR \[esp\+esi\*8-0x1e240\]
@@ -5850,6 +5859,15 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx\+0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a 72 80[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx-0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 7d 2f 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 ymm6\{k7\},XMMWORD PTR \[edx-0x810\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 31[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 31[ 	]*vmovddup xmm6\{k7\}\{z\},QWORD PTR \[ecx\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b4 f4 c0 1d fe ff[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[esp\+esi\*8-0x1e240\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 7f[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx\+0x3f8\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 00 04 00 00[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx\+0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 72 80[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx-0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 b2 f8 fb ff ff[ 	]*vmovddup xmm6\{k7\},QWORD PTR \[edx-0x408\]
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 0f 12 f5[ 	]*vmovddup xmm6\{k7\},xmm5
+[ 	]*[a-f0-9]+:[ 	]*62 f1 ff 8f 12 f5[ 	]*vmovddup xmm6\{k7\}\{z\},xmm5
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 31[ 	]*vbroadcastsd ymm6\{k7\},QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd af 19 31[ 	]*vbroadcastsd ymm6\{k7\}\{z\},QWORD PTR \[ecx\]
 [ 	]*[a-f0-9]+:[ 	]*62 f2 fd 2f 19 b4 f4 c0 1d fe ff[ 	]*vbroadcastsd ymm6\{k7\},QWORD PTR \[esp\+esi\*8-0x1e240\]
--- a/gas/testsuite/gas/i386/x86-64-avx.d
+++ b/gas/testsuite/gas/i386/x86-64-avx.d
@@ -875,6 +875,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 35 21       	vpmovzxdq \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%rcx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd %xmm4,\(%rcx\)
 [ 	]*[a-f0-9]+:	c5 f8 13 21          	vmovlps %xmm4,\(%rcx\)
@@ -2818,6 +2819,8 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd \(%rcx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%rcx\),%xmm4
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd %xmm4,\(%rcx\)
--- a/gas/testsuite/gas/i386/x86-64-avx.s
+++ b/gas/testsuite/gas/i386/x86-64-avx.s
@@ -930,6 +930,7 @@ _start:
 	vucomisd (%rcx),%xmm4
 
 # Tests for op mem64, xmm
+	vbroadcastsd (%rcx),%xmm4
 	vmovsd (%rcx),%xmm4
 
 # Tests for op xmm, mem64
@@ -3024,6 +3025,8 @@ _start:
 	vucomisd xmm4,[rcx]
 
 # Tests for op mem64, xmm
+	vbroadcastsd xmm4,QWORD PTR [rcx]
+	vbroadcastsd xmm4,[rcx]
 	vmovsd xmm4,QWORD PTR [rcx]
 	vmovsd xmm4,[rcx]
 
--- a/gas/testsuite/gas/i386/x86-64-avx-intel.d
+++ b/gas/testsuite/gas/i386/x86-64-avx-intel.d
@@ -876,6 +876,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 35 21       	vpmovzxdq xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd xmm6,xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd QWORD PTR \[rcx\],xmm4
 [ 	]*[a-f0-9]+:	c5 f8 13 21          	vmovlps QWORD PTR \[rcx\],xmm4
@@ -2819,6 +2820,8 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c5 f9 2e f4          	vucomisd xmm6,xmm4
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 f9 2e 21          	vucomisd xmm4,QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 21          	vmovddup xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 fb 10 21          	vmovsd xmm4,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c5 f9 13 21          	vmovlpd QWORD PTR \[rcx\],xmm4
--- a/gas/testsuite/gas/i386/x86-64-avx2.d
+++ b/gas/testsuite/gas/i386/x86-64-avx2.d
@@ -73,6 +73,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 78 21       	vpbroadcastb \(%rcx\),%xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb %xmm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%rcx\),%ymm4
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 5d 8c 31       	vpmaskmovd \(%rcx\),%ymm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 4d 8e 21       	vpmaskmovd %ymm4,%ymm6,\(%rcx\)
@@ -177,5 +178,6 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb %xmm4,%ymm6
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%rcx\),%ymm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb \(%rcx\),%ymm4
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup %xmm4,%xmm6
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss %xmm4,%xmm6
 #pass
--- a/gas/testsuite/gas/i386/x86-64-avx2.s
+++ b/gas/testsuite/gas/i386/x86-64-avx2.s
@@ -114,6 +114,7 @@ _start:
 	vpbroadcastb (%rcx),%ymm4
 
 # Tests for op xmm, xmm
+	vbroadcastsd %xmm4,%xmm6
 	vbroadcastss %xmm4,%xmm6
 
 	.intel_syntax noprefix
@@ -265,4 +266,5 @@ _start:
 	vpbroadcastb ymm4,[rcx]
 
 # Tests for op xmm, xmm
+	vbroadcastsd xmm6,xmm4
 	vbroadcastss xmm6,xmm4
--- a/gas/testsuite/gas/i386/x86-64-avx2-intel.d
+++ b/gas/testsuite/gas/i386/x86-64-avx2-intel.d
@@ -74,6 +74,7 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 79 78 21       	vpbroadcastb xmm4,BYTE PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb ymm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[rcx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 5d 8c 31       	vpmaskmovd ymm6,ymm4,YMMWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c4 e2 4d 8e 21       	vpmaskmovd YMMWORD PTR \[rcx\],ymm6,ymm4
@@ -178,5 +179,6 @@ Disassembly of section .text:
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 f4       	vpbroadcastb ymm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[rcx\]
 [ 	]*[a-f0-9]+:	c4 e2 7d 78 21       	vpbroadcastb ymm4,BYTE PTR \[rcx\]
+[ 	]*[a-f0-9]+:	c5 fb 12 f4          	vmovddup xmm6,xmm4
 [ 	]*[a-f0-9]+:	c4 e2 79 18 f4       	vbroadcastss xmm6,xmm4
 #pass
--- a/gas/testsuite/gas/i386/x86-64-avx512f_vl.d
+++ b/gas/testsuite/gas/i386/x86-64-avx512f_vl.d
@@ -167,6 +167,17 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 0x800\(%rdx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a 72 80[ 	]*vbroadcasti32x4 -0x800\(%rdx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 -0x810\(%rdx\),%ymm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 31[ 	]*vmovddup \(%rcx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 0f 12 31[ 	]*vmovddup \(%rcx\),%xmm30\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 8f 12 31[ 	]*vmovddup \(%rcx\),%xmm30\{%k7\}\{z\}
+[ 	]*[a-f0-9]+:[ 	]*62 21 ff 08 12 b4 f0 23 01 00 00[ 	]*vmovddup 0x123\(%rax,%r14,8\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 7f[ 	]*vmovddup 0x3f8\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 00 04 00 00[ 	]*vmovddup 0x400\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 80[ 	]*vmovddup -0x400\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 f8 fb ff ff[ 	]*vmovddup -0x408\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 08 12 f5[ 	]*vmovddup %xmm29,%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 0f 12 f5[ 	]*vmovddup %xmm29,%xmm30\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 8f 12 f5[ 	]*vmovddup %xmm29,%xmm30\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 28 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 2f 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd af 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30\{%k7\}\{z\}
@@ -6474,6 +6485,17 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 0x800\(%rdx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a 72 80[ 	]*vbroadcasti32x4 -0x800\(%rdx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 -0x810\(%rdx\),%ymm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 31[ 	]*vmovddup \(%rcx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 0f 12 31[ 	]*vmovddup \(%rcx\),%xmm30\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 8f 12 31[ 	]*vmovddup \(%rcx\),%xmm30\{%k7\}\{z\}
+[ 	]*[a-f0-9]+:[ 	]*62 21 ff 08 12 b4 f0 34 12 00 00[ 	]*vmovddup 0x1234\(%rax,%r14,8\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 7f[ 	]*vmovddup 0x3f8\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 00 04 00 00[ 	]*vmovddup 0x400\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 80[ 	]*vmovddup -0x400\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 f8 fb ff ff[ 	]*vmovddup -0x408\(%rdx\),%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 08 12 f5[ 	]*vmovddup %xmm29,%xmm30
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 0f 12 f5[ 	]*vmovddup %xmm29,%xmm30\{%k7\}
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 8f 12 f5[ 	]*vmovddup %xmm29,%xmm30\{%k7\}\{z\}
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 28 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 2f 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30\{%k7\}
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd af 19 31[ 	]*vbroadcastsd \(%rcx\),%ymm30\{%k7\}\{z\}
--- a/gas/testsuite/gas/i386/x86-64-avx512f_vl.s
+++ b/gas/testsuite/gas/i386/x86-64-avx512f_vl.s
@@ -161,6 +161,17 @@ _start:
 	vbroadcasti32x4	2048(%rdx), %ymm30	 # AVX512{F,VL}
 	vbroadcasti32x4	-2048(%rdx), %ymm30	 # AVX512{F,VL} Disp8
 	vbroadcasti32x4	-2064(%rdx), %ymm30	 # AVX512{F,VL}
+	vbroadcastsd	(%rcx), %xmm30	 # AVX512{F,VL}
+	vbroadcastsd	(%rcx), %xmm30{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	(%rcx), %xmm30{%k7}{z}	 # AVX512{F,VL}
+	vbroadcastsd	0x123(%rax,%r14,8), %xmm30	 # AVX512{F,VL}
+	vbroadcastsd	1016(%rdx), %xmm30	 # AVX512{F,VL} Disp8
+	vbroadcastsd	1024(%rdx), %xmm30	 # AVX512{F,VL}
+	vbroadcastsd	-1024(%rdx), %xmm30	 # AVX512{F,VL} Disp8
+	vbroadcastsd	-1032(%rdx), %xmm30	 # AVX512{F,VL}
+	vbroadcastsd	%xmm29, %xmm30	 # AVX512{F,VL}
+	vbroadcastsd	%xmm29, %xmm30{%k7}	 # AVX512{F,VL}
+	vbroadcastsd	%xmm29, %xmm30{%k7}{z}	 # AVX512{F,VL}
 	vbroadcastsd	(%rcx), %ymm30	 # AVX512{F,VL}
 	vbroadcastsd	(%rcx), %ymm30{%k7}	 # AVX512{F,VL}
 	vbroadcastsd	(%rcx), %ymm30{%k7}{z}	 # AVX512{F,VL}
@@ -6470,6 +6481,17 @@ _start:
 	vbroadcasti32x4	ymm30, XMMWORD PTR [rdx+2048]	 # AVX512{F,VL}
 	vbroadcasti32x4	ymm30, XMMWORD PTR [rdx-2048]	 # AVX512{F,VL} Disp8
 	vbroadcasti32x4	ymm30, XMMWORD PTR [rdx-2064]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30, QWORD PTR [rcx]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30{k7}, QWORD PTR [rcx]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30{k7}{z}, QWORD PTR [rcx]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30, QWORD PTR [rax+r14*8+0x1234]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30, QWORD PTR [rdx+1016]	 # AVX512{F,VL} Disp8
+	vbroadcastsd	xmm30, QWORD PTR [rdx+1024]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30, QWORD PTR [rdx-1024]	 # AVX512{F,VL} Disp8
+	vbroadcastsd	xmm30, QWORD PTR [rdx-1032]	 # AVX512{F,VL}
+	vbroadcastsd	xmm30, xmm29	 # AVX512{F,VL}
+	vbroadcastsd	xmm30{k7}, xmm29	 # AVX512{F,VL}
+	vbroadcastsd	xmm30{k7}{z}, xmm29	 # AVX512{F,VL}
 	vbroadcastsd	ymm30, QWORD PTR [rcx]	 # AVX512{F,VL}
 	vbroadcastsd	ymm30{k7}, QWORD PTR [rcx]	 # AVX512{F,VL}
 	vbroadcastsd	ymm30{k7}{z}, QWORD PTR [rcx]	 # AVX512{F,VL}
--- a/gas/testsuite/gas/i386/x86-64-avx512f_vl-intel.d
+++ b/gas/testsuite/gas/i386/x86-64-avx512f_vl-intel.d
@@ -167,6 +167,17 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx\+0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a 72 80[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx-0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx-0x810\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 31[ 	]*vmovddup xmm30,QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 0f 12 31[ 	]*vmovddup xmm30\{k7\},QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 8f 12 31[ 	]*vmovddup xmm30\{k7\}\{z\},QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 21 ff 08 12 b4 f0 23 01 00 00[ 	]*vmovddup xmm30,QWORD PTR \[rax\+r14\*8\+0x123\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 7f[ 	]*vmovddup xmm30,QWORD PTR \[rdx\+0x3f8\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 00 04 00 00[ 	]*vmovddup xmm30,QWORD PTR \[rdx\+0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 80[ 	]*vmovddup xmm30,QWORD PTR \[rdx-0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 f8 fb ff ff[ 	]*vmovddup xmm30,QWORD PTR \[rdx-0x408\]
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 08 12 f5[ 	]*vmovddup xmm30,xmm29
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 0f 12 f5[ 	]*vmovddup xmm30\{k7\},xmm29
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 8f 12 f5[ 	]*vmovddup xmm30\{k7\}\{z\},xmm29
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 28 19 31[ 	]*vbroadcastsd ymm30,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 2f 19 31[ 	]*vbroadcastsd ymm30\{k7\},QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd af 19 31[ 	]*vbroadcastsd ymm30\{k7\}\{z\},QWORD PTR \[rcx\]
@@ -6474,6 +6485,17 @@ Disassembly of section \.text:
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 00 08 00 00[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx\+0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a 72 80[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx-0x800\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 7d 28 5a b2 f0 f7 ff ff[ 	]*vbroadcasti32x4 ymm30,XMMWORD PTR \[rdx-0x810\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 31[ 	]*vmovddup xmm30,QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 0f 12 31[ 	]*vmovddup xmm30\{k7\},QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 8f 12 31[ 	]*vmovddup xmm30\{k7\}\{z\},QWORD PTR \[rcx\]
+[ 	]*[a-f0-9]+:[ 	]*62 21 ff 08 12 b4 f0 34 12 00 00[ 	]*vmovddup xmm30,QWORD PTR \[rax\+r14\*8\+0x1234\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 7f[ 	]*vmovddup xmm30,QWORD PTR \[rdx\+0x3f8\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 00 04 00 00[ 	]*vmovddup xmm30,QWORD PTR \[rdx\+0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 72 80[ 	]*vmovddup xmm30,QWORD PTR \[rdx-0x400\]
+[ 	]*[a-f0-9]+:[ 	]*62 61 ff 08 12 b2 f8 fb ff ff[ 	]*vmovddup xmm30,QWORD PTR \[rdx-0x408\]
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 08 12 f5[ 	]*vmovddup xmm30,xmm29
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 0f 12 f5[ 	]*vmovddup xmm30\{k7\},xmm29
+[ 	]*[a-f0-9]+:[ 	]*62 01 ff 8f 12 f5[ 	]*vmovddup xmm30\{k7\}\{z\},xmm29
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 28 19 31[ 	]*vbroadcastsd ymm30,QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd 2f 19 31[ 	]*vbroadcastsd ymm30\{k7\},QWORD PTR \[rcx\]
 [ 	]*[a-f0-9]+:[ 	]*62 62 fd af 19 31[ 	]*vbroadcastsd ymm30\{k7\}\{z\},QWORD PTR \[rcx\]
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1495,6 +1495,8 @@ vblendp<sd>, 0x660c | <sd:opc>, AVX, Mod
 vblendvp<sd>, 0x664a | <sd:opc>, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { RegXMM|RegYMM, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vbroadcastf128, 0x661a, AVX, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
 vbroadcastsd, 0x6619, AVX, Modrm|Vex256|Space0F38|VexW0|NoSuf, { Qword|Unspecified|BaseIndex, RegYMM }
+// As an extension, provide a 128-bit form as well, utilizing vmovddup.
+vbroadcastsd, 0xf212, AVX, Modrm|Vex128|Space0F|VexW0|NoSuf, { Qword|Unspecified|BaseIndex, RegXMM }
 vbroadcastss, 0x6618, AVX, Modrm|Vex128|Space0F38|VexW0|NoSuf, { Dword|Unspecified|BaseIndex, RegXMM|RegYMM }
 vcmp<frel>p<sd>, 0x<sd:ppfx>c2/0x<frel:imm>, AVX, Modrm|<frel:comm>|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
 vcmp<frel>s<sd>, 0x<sd:spfx>c2/0x<frel:imm>, AVX, Modrm|<frel:comm>|VexLIG|Space0F|VexVVVV|VexWIG|NoSuf|ImmExt, { RegXMM|<sd:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
@@ -1731,6 +1733,8 @@ vpmovzxwq, 0x6634, AVX2, Modrm|Vex=2|Spa
 
 vbroadcasti128, 0x665A, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
 vbroadcastsd, 0x6619, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { RegXMM, RegYMM }
+// As an extension, provide a 128-bit form as well, utilizing vmovddup.
+vbroadcastsd, 0xf212, AVX2, Modrm|Vex128|Space0F|VexW0|NoSuf, { RegXMM, RegXMM }
 vbroadcastss, 0x6618, AVX2, Modrm|Vex|Space0F38|VexW=1|NoSuf, { RegXMM, RegXMM|RegYMM }
 vpblendd, 0x6602, AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpbroadcast<bw>, 0x6678 | <bw:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { <bw:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
@@ -2128,6 +2132,8 @@ vbroadcasti64x4, 0x665B, AVX512F, Modrm|
 
 vbroadcastss, 0x6618, AVX512F, Modrm|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vbroadcastsd, 0x6619, AVX512F, Modrm|Masking|Space0F38|VexW1|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
+// As an extension, provide a 128-bit form as well, utilizing vmovddup.
+vbroadcastsd, 0xf212, AVX512F|AVX512VL, Modrm|EVex128|Masking|Space0F|VexW1|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
 
 vpbroadcast<dq>, 0x6658 | <dq:opc>, AVX512F, Modrm|Masking|Space0F38|<dq:vexw>|Disp8MemShift|NoSuf, { RegXMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpbroadcast<dq>, 0x667c, AVX512F, Modrm|Masking|Space0F38|<dq:vexw64>|NoSuf, { <dq:gpr>, RegXMM|RegYMM|RegZMM }


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo
  2023-06-16  7:32 ` [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo Jan Beulich
@ 2023-06-16 16:59   ` H.J. Lu
  2023-06-19  7:20     ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: H.J. Lu @ 2023-06-16 16:59 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Binutils

On Fri, Jun 16, 2023 at 12:32 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> VBROADCASTSD not supporting 128-bit destinations in any of their AVX,
> AVX2, or AVX512F incarnations is presumably because of VMOVDDUP
> precisely supporting this very operation. (It is therefore different
> from e.g. VPBROADCASTQ, which has no exact equivalent.) Still its
> absence has led to people using VPBROADCASTQ as substitution; this could
> have been avoided if such a pseudo had been supported from the very
> beginning.
>
> Note that the pseudos try to match what the real instructions would have
> used as closely as possible, i.e. VexW0 instead of VexWIG for the AVX
> and AVX2 forms as well as AVX2 in the first place for the register
> source form.
> ---
> For being the first example of us supplying such, this is partly RFC. On
> top of that a question is also whether to indeed have split AVX/AVX2
> templates, when in principle one (allowing for both memory and register
> source) could do.
>

I don't think assembler should invent such instructions.

-- 
H.J.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo
  2023-06-16 16:59   ` H.J. Lu
@ 2023-06-19  7:20     ` Jan Beulich
  2023-06-20 16:07       ` H.J. Lu
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2023-06-19  7:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Binutils

On 16.06.2023 18:59, H.J. Lu wrote:
> On Fri, Jun 16, 2023 at 12:32 AM Jan Beulich <jbeulich@suse.com> wrote:
>> VBROADCASTSD not supporting 128-bit destinations in any of their AVX,
>> AVX2, or AVX512F incarnations is presumably because of VMOVDDUP
>> precisely supporting this very operation. (It is therefore different
>> from e.g. VPBROADCASTQ, which has no exact equivalent.) Still its
>> absence has led to people using VPBROADCASTQ as substitution; this could
>> have been avoided if such a pseudo had been supported from the very
>> beginning.
>>
>> Note that the pseudos try to match what the real instructions would have
>> used as closely as possible, i.e. VexW0 instead of VexWIG for the AVX
>> and AVX2 forms as well as AVX2 in the first place for the register
>> source form.
>> ---
>> For being the first example of us supplying such, this is partly RFC. On
>> top of that a question is also whether to indeed have split AVX/AVX2
>> templates, when in principle one (allowing for both memory and register
>> source) could do.
>>
> 
> I don't think assembler should invent such instructions.

May I ask about the "why" behind this? If such a pseudo had been there
from the beginning, an admittedly minor mistake like that corrected by
gcc commit a4df0ce78d6f likely wouldn't have been made, because no
special casing of V2DFmode would have been necessary in the first place.

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo
  2023-06-19  7:20     ` Jan Beulich
@ 2023-06-20 16:07       ` H.J. Lu
  2023-06-21  9:01         ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: H.J. Lu @ 2023-06-20 16:07 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Binutils

On Mon, Jun 19, 2023 at 12:20 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 16.06.2023 18:59, H.J. Lu wrote:
> > On Fri, Jun 16, 2023 at 12:32 AM Jan Beulich <jbeulich@suse.com> wrote:
> >> VBROADCASTSD not supporting 128-bit destinations in any of their AVX,
> >> AVX2, or AVX512F incarnations is presumably because of VMOVDDUP
> >> precisely supporting this very operation. (It is therefore different
> >> from e.g. VPBROADCASTQ, which has no exact equivalent.) Still its
> >> absence has led to people using VPBROADCASTQ as substitution; this could
> >> have been avoided if such a pseudo had been supported from the very
> >> beginning.
> >>
> >> Note that the pseudos try to match what the real instructions would have
> >> used as closely as possible, i.e. VexW0 instead of VexWIG for the AVX
> >> and AVX2 forms as well as AVX2 in the first place for the register
> >> source form.
> >> ---
> >> For being the first example of us supplying such, this is partly RFC. On
> >> top of that a question is also whether to indeed have split AVX/AVX2
> >> templates, when in principle one (allowing for both memory and register
> >> source) could do.
> >>
> >
> > I don't think assembler should invent such instructions.
>
> May I ask about the "why" behind this? If such a pseudo had been there
> from the beginning, an admittedly minor mistake like that corrected by
> gcc commit a4df0ce78d6f likely wouldn't have been made, because no
> special casing of V2DFmode would have been necessary in the first place.
>
> Jan

All x86 instructions should come from the x86 SDM.

-- 
H.J.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo
  2023-06-20 16:07       ` H.J. Lu
@ 2023-06-21  9:01         ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2023-06-21  9:01 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Binutils

On 20.06.2023 18:07, H.J. Lu wrote:
> On Mon, Jun 19, 2023 at 12:20 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 16.06.2023 18:59, H.J. Lu wrote:
>>> On Fri, Jun 16, 2023 at 12:32 AM Jan Beulich <jbeulich@suse.com> wrote:
>>>> VBROADCASTSD not supporting 128-bit destinations in any of their AVX,
>>>> AVX2, or AVX512F incarnations is presumably because of VMOVDDUP
>>>> precisely supporting this very operation. (It is therefore different
>>>> from e.g. VPBROADCASTQ, which has no exact equivalent.) Still its
>>>> absence has led to people using VPBROADCASTQ as substitution; this could
>>>> have been avoided if such a pseudo had been supported from the very
>>>> beginning.
>>>>
>>>> Note that the pseudos try to match what the real instructions would have
>>>> used as closely as possible, i.e. VexW0 instead of VexWIG for the AVX
>>>> and AVX2 forms as well as AVX2 in the first place for the register
>>>> source form.
>>>> ---
>>>> For being the first example of us supplying such, this is partly RFC. On
>>>> top of that a question is also whether to indeed have split AVX/AVX2
>>>> templates, when in principle one (allowing for both memory and register
>>>> source) could do.
>>>>
>>>
>>> I don't think assembler should invent such instructions.
>>
>> May I ask about the "why" behind this? If such a pseudo had been there
>> from the beginning, an admittedly minor mistake like that corrected by
>> gcc commit a4df0ce78d6f likely wouldn't have been made, because no
>> special casing of V2DFmode would have been necessary in the first place.
> 
> All x86 instructions should come from the x86 SDM.

Ehem. See "clr" for an example where syntax doesn't matter (IOW I wasn't
really right in saying this is the first example). There are also various
AT&T-invented mnemonics we support (and - wrongly - even in Intel syntax).
There are further insn forms (number and/or kind of operands) which aren't
backed by the SDM.

I'm afraid I can't take this single sentence as an answer to my question
of "Why?" Even less so since it doesn't address at all the reason I gave
for why I think we should have had such a pseudo from the beginning.

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-06-21  9:01 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
2023-06-16  7:30 ` [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources Jan Beulich
2023-06-16  7:31 ` [PATCH 2/4] x86: optimize pre-AVX512 {,V}PCMPGT* " Jan Beulich
2023-06-16  7:31 ` [PATCH 3/4] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ Jan Beulich
2023-06-16  7:32 ` [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo Jan Beulich
2023-06-16 16:59   ` H.J. Lu
2023-06-19  7:20     ` Jan Beulich
2023-06-20 16:07       ` H.J. Lu
2023-06-21  9:01         ` Jan Beulich

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).