public inbox for binutils@sourceware.org
* [PATCH v2 0/4] x86: fold a number of VEX and EVEX templates
@ 2023-09-19 15:43 Jan Beulich
  2023-09-19 15:44 ` [PATCH v2 1/4] x86: fold certain " Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Jan Beulich @ 2023-09-19 15:43 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

The RFC reason for the last two patches was addressed. Still, it's up
for debate whether the last patch in particular is worth it (see also
the remarks there).

1: fold certain VEX and EVEX templates
2: fold VAES/VPCLMULQDQ VEX and EVEX templates
3: fold FMA VEX and EVEX templates
4: fold F16C VEX and EVEX templates

Jan


* [PATCH v2 1/4] x86: fold certain VEX and EVEX templates
  2023-09-19 15:43 [PATCH v2 0/4] x86: fold a number of VEX and EVEX templates Jan Beulich
@ 2023-09-19 15:44 ` Jan Beulich
  2023-09-19 15:45 ` [PATCH v2 2/4] x86: fold VAES/VPCLMULQDQ " Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2023-09-19 15:44 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu, Lili Cui

In anticipation of APX, introduce logic to reduce the number of
templates we have now, which in turn limits the number of new ones we
will then need to gain.
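
As an illustration (example of mine, not taken from the patch): with such a
folded, dual VEX/EVEX template the encoding gets chosen from what the
operands and modifiers actually demand, along the lines of

        vpermd  %ymm1, %ymm2, %ymm3          # AVX2, VEX-encoded
        vpermd  %zmm1, %zmm2, %zmm3          # AVX512F, EVEX-encoded
        vpermd  %ymm1, %ymm2, %ymm3{%k1}     # masking forces EVEX (AVX512VL)
        {evex} vpermd %ymm1, %ymm2, %ymm3    # pseudo prefix forces EVEX, too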

The fundamental requirements are that
- attributes be compatible, which specifically means VexW needs to be
  the same in the templates (which often isn't the case, as VEX
  encodings have far more WIG than EVEX ones),
- the EVEX form be AVX512F (with or without AVX512VL), not any of its
  extensions (the same will then be required for APX - it'll need to be
  APX_F).

Note that in check_register() there's now a redundant zmm check. Since
this logic will need revisiting for APX anyway, I'd like to keep it that
way for now. (Similarly a couple of if()-s which could be folded are
kept separate, to reduce code churn when adding APX support.)
---
RFC: Of course there are quite a few code changes, so there is the
     question of whether the savings are worth it.

The AVX512F constraint may be possible to relax, but the change is big
enough already.
---
v2: Deal with need_evex_encoding() being unreliable ahead of operand
    parsing (the difference really is largely benign here, but becomes
    relevant in subsequent changes).
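
    As an illustration (example of mine, not from the patch): what makes
    need_evex_encoding() return true is generally only seen while the
    operands are being parsed, e.g.

        vpsravd %ymm1, %ymm2, %ymm3          # plain VEX form suffices
        vpsravd %ymm1, %ymm2, %ymm3{%k1}     # the mask register, only seen
                                             # during operand parsing, demands
                                             # EVEX (and AVX512VL)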

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -436,6 +436,7 @@ struct _i386_insn
 	vex_encoding_vex,
 	vex_encoding_vex3,
 	vex_encoding_evex,
+	vex_encoding_evex512,
 	vex_encoding_error
       } vec_encoding;
 
@@ -1872,6 +1873,13 @@ cpu_flags_and_not (i386_cpu_flags x, i38
 
 static const i386_cpu_flags avx512 = CPU_ANY_AVX512F_FLAGS;
 
+static INLINE bool need_evex_encoding (void)
+{
+  return i.vec_encoding == vex_encoding_evex
+	|| i.vec_encoding == vex_encoding_evex512
+	|| i.mask.reg;
+}
+
 #define CPU_FLAGS_ARCH_MATCH		0x1
 #define CPU_FLAGS_64BIT_MATCH		0x2
 
@@ -1899,6 +1907,29 @@ cpu_flags_match (const insn_template *t)
       /* This instruction is available only on some archs.  */
       i386_cpu_flags cpu = cpu_arch_flags;
 
+      /* Dual VEX/EVEX templates may need stripping of one of the flags.  */
+      if (t->opcode_modifier.vex && t->opcode_modifier.evex)
+	{
+	  /* Dual AVX/AVX512F templates need to retain AVX512F only if we already
+	     know that EVEX encoding will be needed.  */
+	  if ((x.bitfield.cpuavx || x.bitfield.cpuavx2)
+	      && x.bitfield.cpuavx512f)
+	    {
+	      if (need_evex_encoding ())
+		{
+		  x.bitfield.cpuavx = 0;
+		  x.bitfield.cpuavx2 = 0;
+		}
+	      /* need_evex_encoding() isn't reliable before operands were
+		 parsed.  */
+	      else if (i.operands)
+		{
+		  x.bitfield.cpuavx512f = 0;
+		  x.bitfield.cpuavx512vl = 0;
+		}
+	    }
+	}
+
       /* AVX512VL is no standalone feature - match it and then strip it.  */
       if (x.bitfield.cpuavx512vl && !cpu.bitfield.cpuavx512vl)
 	return match;
@@ -1924,6 +1955,8 @@ cpu_flags_match (const insn_template *t)
 		  && (!x.bitfield.cpupclmulqdq || cpu.bitfield.cpupclmulqdq))
 		match |= CPU_FLAGS_ARCH_MATCH;
 	    }
+	  else if (x.bitfield.cpuavx2 && cpu.bitfield.cpuavx2)
+	    match |= CPU_FLAGS_ARCH_MATCH;
 	  else if (x.bitfield.cpuavx512f)
 	    {
 	      /* We need to check a few extra flags with AVX512F.  */
@@ -3646,6 +3679,27 @@ install_template (const insn_template *t
 
   i.tm = *t;
 
+  /* Dual VEX/EVEX templates need stripping one of the possible variants.  */
+  if (t->opcode_modifier.vex && t->opcode_modifier.evex)
+  {
+      if ((is_cpu (t, CpuAVX) || is_cpu (t, CpuAVX2))
+	  && is_cpu (t, CpuAVX512F))
+	{
+	  if (need_evex_encoding ())
+	    {
+	      i.tm.opcode_modifier.vex = 0;
+	      i.tm.cpu.bitfield.cpuavx = 0;
+	      if (is_cpu (&i.tm, CpuAVX2))
+	        i.tm.cpu.bitfield.isa = 0;
+	    }
+	  else
+	    {
+	      i.tm.opcode_modifier.evex = 0;
+	      i.tm.cpu.bitfield.cpuavx512f = 0;
+	    }
+	}
+  }
+
   /* Note that for pseudo prefixes this produces a length of 1. But for them
      the length isn't interesting at all.  */
   for (l = 1; l < 4; ++l)
@@ -4553,6 +4607,8 @@ optimize_encoding (void)
 	      i.tm.opcode_modifier.vex = VEX128;
 	      i.tm.opcode_modifier.vexw = VEXW0;
 	      i.tm.opcode_modifier.evex = 0;
+	      i.vec_encoding = vex_encoding_vex;
+	      i.mask.reg = NULL;
 	    }
 	  else if (optimize > 1)
 	    i.tm.opcode_modifier.evex = EVEX128;
@@ -5438,6 +5494,11 @@ md_assemble (char *line)
   if (optimize && !i.no_optimize && i.tm.opcode_modifier.optimize)
     optimize_encoding ();
 
+  /* Past optimization there's no need to distinguish vex_encoding_evex and
+     vex_encoding_evex512 anymore.  */
+  if (i.vec_encoding == vex_encoding_evex512)
+    i.vec_encoding = vex_encoding_evex;
+
   if (use_unaligned_vector_move)
     encode_with_unaligned_vector_move ();
 
@@ -5467,6 +5528,7 @@ md_assemble (char *line)
 	  if (i.tm.operand_types[j].bitfield.tmmword)
 	    i.xstate |= xstate_tmm;
 	  else if (i.tm.operand_types[j].bitfield.zmmword
+		   && !i.tm.opcode_modifier.vex
 		   && vector_size >= VSZ512)
 	    i.xstate |= xstate_zmm;
 	  else if (i.tm.operand_types[j].bitfield.ymmword
@@ -6468,7 +6530,8 @@ check_VecOperands (const insn_template *
   cpu = cpu_flags_and (cpu_flags_from_attr (t->cpu), avx512);
   if (!cpu_flags_all_zero (&cpu)
       && !is_cpu (t, CpuAVX512VL)
-      && !cpu_arch_flags.bitfield.cpuavx512vl)
+      && !cpu_arch_flags.bitfield.cpuavx512vl
+      && (!t->opcode_modifier.vex || need_evex_encoding ()))
     {
       for (op = 0; op < t->operands; ++op)
 	{
@@ -6779,6 +6842,8 @@ check_VecOperands (const insn_template *
 
   /* Check vector Disp8 operand.  */
   if (t->opcode_modifier.disp8memshift
+      && (!t->opcode_modifier.vex
+          || need_evex_encoding ())
       && i.disp_encoding <= disp_encoding_8bit)
     {
       if (i.broadcast.type || i.broadcast.bytes)
@@ -6874,7 +6939,8 @@ VEX_check_encoding (const insn_template
       return 1;
     }
 
-  if (i.vec_encoding == vex_encoding_evex)
+  if (i.vec_encoding == vex_encoding_evex
+      || i.vec_encoding == vex_encoding_evex512)
     {
       /* This instruction must be encoded with EVEX prefix.  */
       if (!is_evex_encoding (t))
@@ -11211,6 +11277,10 @@ s_insn (int dummy ATTRIBUTE_UNUSED)
 	  goto done;
 	}
 
+      /* No need to distinguish vex_encoding_evex and vex_encoding_evex512.  */
+      if (i.vec_encoding == vex_encoding_evex512)
+	i.vec_encoding = vex_encoding_evex;
+
       /* Are we to emit ModR/M encoding?  */
       if (!i.short_form
 	  && (i.mem_operands
@@ -11633,6 +11703,12 @@ RC_SAE_specifier (const char *pstr)
 	      return NULL;
 	    }
 
+	  if (i.vec_encoding == vex_encoding_default)
+	    i.vec_encoding = vex_encoding_evex512;
+	  else if (i.vec_encoding != vex_encoding_evex
+		   && i.vec_encoding != vex_encoding_evex512)
+	    return NULL;
+
 	  i.rounding.type = RC_NamesTable[j].type;
 
 	  return (char *)(pstr + RC_NamesTable[j].len);
@@ -11692,6 +11768,12 @@ check_VecOperations (char *op_string)
 		}
 	      op_string++;
 
+	      if (i.vec_encoding == vex_encoding_default)
+		i.vec_encoding = vex_encoding_evex;
+	      else if (i.vec_encoding != vex_encoding_evex
+		       && i.vec_encoding != vex_encoding_evex512)
+		goto unknown_vec_op;
+
 	      i.broadcast.type = bcst_type;
 	      i.broadcast.operand = this_operand;
 
@@ -13953,8 +14035,17 @@ static bool check_register (const reg_en
 	}
     }
 
-  if (vector_size < VSZ512 && r->reg_type.bitfield.zmmword)
-    return false;
+  if (r->reg_type.bitfield.zmmword)
+    {
+      if (vector_size < VSZ512)
+	return false;
+
+      if (i.vec_encoding == vex_encoding_default)
+	i.vec_encoding = vex_encoding_evex512;
+      else if (i.vec_encoding != vex_encoding_evex
+	       && i.vec_encoding != vex_encoding_evex512)
+	i.vec_encoding = vex_encoding_error;
+    }
 
   if (vector_size < VSZ256 && r->reg_type.bitfield.ymmword)
     return false;
@@ -13979,7 +14070,8 @@ static bool check_register (const reg_en
 	  || flag_code != CODE_64BIT)
 	return false;
 
-      if (i.vec_encoding == vex_encoding_default)
+      if (i.vec_encoding == vex_encoding_default
+	  || i.vec_encoding == vex_encoding_evex512)
 	i.vec_encoding = vex_encoding_evex;
       else if (i.vec_encoding != vex_encoding_evex)
 	i.vec_encoding = vex_encoding_error;
--- a/gas/config/tc-i386-intel.c
+++ b/gas/config/tc-i386-intel.c
@@ -209,6 +209,11 @@ operatorT i386_operator (const char *nam
 	      || i386_types[j].sz[0] > 8
 	      || (i386_types[j].sz[0] & (i386_types[j].sz[0] - 1)))
 	    return O_illegal;
+	  if (i.vec_encoding == vex_encoding_default)
+	    i.vec_encoding = vex_encoding_evex;
+	  else if (i.vec_encoding != vex_encoding_evex
+		   && i.vec_encoding != vex_encoding_evex512)
+	    return O_illegal;
 	  if (!i.broadcast.bytes && !i.broadcast.type)
 	    {
 	      i.broadcast.bytes = i386_types[j].sz[0];
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -131,6 +131,8 @@
 #define EVexLIG EVex=EVEXLIG
 #define EVexDYN EVex=EVEXDYN
 
+#define Disp8ShiftVL Disp8MemShift=DISP8_SHIFT_VL
+
 #define Vsz256 Vsz=VSZ256
 #define Vsz512 Vsz=VSZ512
 
@@ -1518,8 +1520,8 @@ vdivs<sd>, 0x<sd:spfx>5e, AVX, Modrm|Vex
 vdppd, 0x6641, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
 vdpps, 0x6640, AVX, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vextractf128, 0x6619, AVX, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
-vextractps, 0x6617, AVX, Modrm|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg32|Dword|Unspecified|BaseIndex }
-vextractps, 0x6617, AVX|x64, RegMem|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg64 }
+vextractps, 0x6617, AVX|AVX512F, Modrm|Vex128|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf, { Imm8, RegXMM, Reg32|Dword|Unspecified|BaseIndex }
+vextractps, 0x6617, AVX|AVX512F|x64, RegMem|Vex128|EVex128|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg64 }
 vhaddpd, 0x667c, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vhaddps, 0xf27c, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vhsubpd, 0x667d, AVX, Modrm|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1541,7 +1543,7 @@ vmovap<sd>, 0x<sd:ppfx>28, AVX, D|Modrm|
 // by Intel AVX spec).  To avoid extra template in gcc x86 backend and
 // support assembler for AMD64, we accept 64bit operand on vmovd so
 // that we can use one template for both SSE and AVX instructions.
-vmovd, 0x666e, AVX, D|Modrm|Vex=1|Space0F|NoSuf, { Reg32|Unspecified|BaseIndex, RegXMM }
+vmovd, 0x666e, AVX|AVX512F, D|Modrm|Vex128|EVex128|Space0F|Disp8MemShift=2|NoSuf, { Reg32|Unspecified|BaseIndex, RegXMM }
 vmovd, 0x667e, AVX|x64, D|RegMem|Vex=1|Space0F|VexW=2|NoSuf|Size64, { RegXMM, Reg64 }
 vmovddup, 0xf212, AVX, Modrm|Vex|Space0F|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vmovddup, 0xf212, AVX, Modrm|Vex=2|Space0F|VexWIG|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM }
@@ -1559,7 +1561,7 @@ vmovntdqa, 0x662a, AVX|AVX2, Modrm|Vex|S
 vmovntp<sd>, 0x<sd:ppfx>2b, AVX, Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { RegXMM|RegYMM, Xmmword|Ymmword|Unspecified|BaseIndex }
 vmovq, 0xf37e, AVX, Load|Modrm|Vex=1|Space0F|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vmovq, 0x66d6, AVX, Modrm|Vex=1|Space0F|VexWIG|NoSuf, { RegXMM, Qword|Unspecified|BaseIndex|RegXMM }
-vmovq, 0x666e, AVX|x64, D|Modrm|Vex=1|Space0F|VexW=2|NoSuf, { Reg64|Unspecified|BaseIndex, RegXMM }
+vmovq, 0x666e, AVX|AVX512F|x64, D|Modrm|Vex128|EVex128|Space0F|VexW1|Disp8MemShift=3|NoSuf, { Reg64|Unspecified|BaseIndex, RegXMM }
 vmovs<sd>, 0x<sd:spfx>10, AVX, D|Modrm|VexLIG|Space0F|VexWIG|NoSuf, { <sd:elem>|Unspecified|BaseIndex, RegXMM }
 vmovs<sd>, 0x<sd:spfx>10, AVX, D|Modrm|VexLIG|Space0F|VexVVVV|VexWIG|NoSuf, { RegXMM, RegXMM, RegXMM }
 vmovshdup, 0xf316, AVX, Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1599,8 +1601,10 @@ vpcmpgtq, 0x6637, AVX|AVX2, Modrm|Vex|Sp
 vpcmpistri, 0x6663, AVX, Modrm|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpistrm, 0x6662, AVX, Modrm|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vperm2f128, 0x6606, AVX, Modrm|Vex256|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
-vpermilp<sd>, 0x660c | <sd:opc>, AVX, Modrm|Vex|Space0F38|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpermilp<sd>, 0x6604 | <sd:opc>, AVX, Modrm|Vex|Space0F3A|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
+vpermilps, 0x660c, AVX|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpermilps, 0x6604, AVX|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F3A|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vpermilpd, 0x660d, AVX, Modrm|Vex|Space0F38|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpermilpd, 0x6605, AVX, Modrm|Vex|Space0F3A|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
 vpextr<dq>, 0x6616, AVX|<dq:cpu64>, Modrm|Vex|Space0F3A|<dq:vexw64>|NoSuf, { Imm8, RegXMM, <dq:gpr>|Unspecified|BaseIndex }
 vpextrw, 0x66c5, AVX, Load|Modrm|Vex|Space0F|VexWIG|No_bSuf|No_wSuf|No_sSuf, { Imm8, RegXMM, Reg32|Reg64 }
 vpextr<bw>, 0x6614 | <bw:opc>, AVX, RegMem|Vex|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg32|Reg64 }
@@ -1632,18 +1636,18 @@ vpminub, 0x66da, AVX|AVX2, Modrm|C|Vex|S
 vpminud, 0x663b, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpminuw, 0x663a, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpmovmskb, 0x66d7, AVX|AVX2, Modrm|Vex|Space0F|VexWIG|No_bSuf|No_wSuf|No_sSuf, { RegXMM|RegYMM, Reg32|Reg64 }
-vpmovsxbd, 0x6621, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovsxbq, 0x6622, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Word|Unspecified|BaseIndex|RegXMM, RegXMM }
+vpmovsxbd, 0x6621, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
+vpmovsxbq, 0x6622, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
 vpmovsxbw, 0x6620, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vpmovsxdq, 0x6625, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovsxwd, 0x6623, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovsxwq, 0x6624, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovzxbd, 0x6631, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovzxbq, 0x6632, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Word|Unspecified|BaseIndex|RegXMM, RegXMM }
+vpmovsxwd, 0x6623, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
+vpmovsxwq, 0x6624, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
+vpmovzxbd, 0x6631, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
+vpmovzxbq, 0x6632, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
 vpmovzxbw, 0x6630, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vpmovzxdq, 0x6635, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovzxwd, 0x6633, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vpmovzxwq, 0x6634, AVX, Modrm|Vex|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegXMM }
+vpmovzxwd, 0x6633, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
+vpmovzxwq, 0x6634, AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
 vpmuldq, 0x6628, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpmulhrsw, 0x660b, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpmulhuw, 0x66e4, AVX|AVX2, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
@@ -1710,39 +1714,40 @@ vzeroupper, 0x77, AVX, Vex|Space0F|VexWI
 
 // 256bit integer AVX2 instructions.
 
-vpmovsxbd, 0x6621, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovsxbq, 0x6622, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegYMM }
+vpmovsxbd, 0x6621, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
+vpmovsxbq, 0x6622, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
 vpmovsxbw, 0x6620, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
 vpmovsxdq, 0x6625, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovsxwd, 0x6623, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovsxwq, 0x6624, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovzxbd, 0x6631, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovzxbq, 0x6632, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Dword|Unspecified|BaseIndex|RegXMM, RegYMM }
+vpmovsxwd, 0x6623, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
+vpmovsxwq, 0x6624, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
+vpmovzxbd, 0x6631, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
+vpmovzxbq, 0x6632, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
 vpmovzxbw, 0x6630, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
 vpmovzxdq, 0x6635, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovzxwd, 0x6633, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
-vpmovzxwq, 0x6634, AVX2, Modrm|Vex=2|Space0F38|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegYMM }
+vpmovzxwd, 0x6633, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
+vpmovzxwq, 0x6634, AVX2|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
 
 // New AVX2 instructions.
 
 vbroadcasti128, 0x665A, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Xmmword|Unspecified|BaseIndex, RegYMM }
 vbroadcastsd, 0x6619, AVX2, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { RegXMM, RegYMM }
-vbroadcastss, 0x6618, AVX2, Modrm|Vex|Space0F38|VexW=1|NoSuf, { RegXMM, RegXMM|RegYMM }
+vbroadcastss, 0x6618, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpblendd, 0x6602, AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpbroadcast<bw>, 0x6678 | <bw:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf, { <bw:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
-vpbroadcast<dq>, 0x6658 | <dq:opc>, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf|Optimize, { <dq:elem>|Unspecified|BaseIndex|RegXMM, RegXMM|RegYMM }
+vpbroadcastd, 0x6658, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexW0|Disp8MemShift|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vpbroadcastq, 0x6659, AVX2, Modrm|Vex|Space0F38|VexW0|NoSuf|Optimize, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM }
 vperm2i128, 0x6646, AVX2, Modrm|Vex=2|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
-vpermd, 0x6636, AVX2, Modrm|Vex256|Space0F38|VexVVVV|VexW0|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
-vpermpd, 0x6601, AVX2, Modrm|Vex=2|Space0F3A|VexW1|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM }
-vpermps, 0x6616, AVX2, Modrm|Vex256|Space0F38|VexVVVV|VexW0|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM, RegYMM }
-vpermq, 0x6600, AVX2, Modrm|Vex=2|Space0F3A|VexW1|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegYMM, RegYMM }
+vpermd, 0x6636, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
+vpermpd, 0x6601, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
+vpermps, 0x6616, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
+vpermq, 0x6600, AVX2|AVX512F, Modrm|Vex256|EVexDYN|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
 vextracti128, 0x6639, AVX2, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
 vinserti128, 0x6638, AVX2, Modrm|Vex256|Space0F3A|VexVVVV|VexW0|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegYMM, RegYMM }
 vpmaskmov<dq>, 0x668e, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { RegXMM|RegYMM, RegXMM|RegYMM, Xmmword|Ymmword|Unspecified|BaseIndex }
 vpmaskmov<dq>, 0x668c, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
-vpsllv<dq>, 0x6647, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpsravd, 0x6646, AVX2, Modrm|Vex|Space0F38|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpsrlv<dq>, 0x6645, AVX2, Modrm|Vex|Space0F38|VexVVVV|<dq:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpsllv<dq>, 0x6647, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpsravd, 0x6646, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpsrlv<dq>, 0x6645, AVX2|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 // AVX gather instructions
 vgatherdpd, 0x6692, AVX2, Modrm|Vex|Space0F38|VexVVVV|VexW1|SwapSources|CheckOperandSize|NoSuf|VecSIB128, { RegXMM|RegYMM, Qword|Unspecified|BaseIndex, RegXMM|RegYMM }
@@ -1779,7 +1784,7 @@ vpclmulhqhqdq, 0x6644/0x11, AVX|PCLMULQD
 
 vgf2p8affineinvqb, 0x66cf, AVX|GFNI, Modrm|Vex|Space0F3A|VexVVVV|VexW1|CheckOperandSize|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vgf2p8affineqb, 0x66ce, AVX|GFNI, Modrm|Vex|Space0F3A|VexVVVV|VexW1|CheckOperandSize|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vgf2p8mulb, 0x66cf, AVX|GFNI, Modrm|Vex|Space0F38|VexVVVV|VexW0|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vgf2p8mulb, 0x66cf, GFNI|AVX|AVX512F, Modrm|Vex|EVexDYN|Masking|Space0F38|VexVVVV|VexW0|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 // FSGSBASE, RDRND and F16C
 
@@ -2082,8 +2087,6 @@ vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ,
 
 // AVX512F instructions.
 
-#define Disp8ShiftVL Disp8MemShift=DISP8_SHIFT_VL
-
 <sdh:cpu:cpudq:ppfx:spfx:pfx:spc1:spc2:opc:vexw:elem, +
     s:AVX512F:AVX512DQ::f3:66:Space0F:Space0F38:0:VexW0:Dword, +
     d:AVX512F:AVX512DQ:66:f2:66:Space0F:Space0F38:1:VexW1:Qword, +
@@ -2142,9 +2145,7 @@ vpmuldq, 0x6628, AVX512F, Modrm|Masking|
 vpmulld, 0x6640, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW=1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vprolv<dq>, 0x6615, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vprorv<dq>, 0x6614, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpsllv<dq>, 0x6647, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpsrav<dq>, 0x6646, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpsrlv<dq>, 0x6645, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpsravq, 0x6646, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vpternlog<dq>, 0x6625, AVX512F, Modrm|Masking|Space0F3A|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 vbroadcastf32x4, 0x661A, AVX512F, Modrm|Masking|Space0F38|VexW=1|Disp8MemShift=4|NoSuf, { XMMword|Unspecified|BaseIndex, RegYMM|RegZMM }
@@ -2153,10 +2154,9 @@ vbroadcasti32x4, 0x665A, AVX512F, Modrm|
 vbroadcastf64x4, 0x661B, AVX512F, Modrm|EVex=1|Masking|Space0F38|VexW=2|Disp8MemShift=5|NoSuf, { YMMword|Unspecified|BaseIndex, RegZMM }
 vbroadcasti64x4, 0x665B, AVX512F, Modrm|EVex=1|Masking|Space0F38|VexW=2|Disp8MemShift=5|NoSuf, { YMMword|Unspecified|BaseIndex, RegZMM }
 
-vbroadcastss, 0x6618, AVX512F, Modrm|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vbroadcastsd, 0x6619, AVX512F, Modrm|Masking|Space0F38|VexW1|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
 
-vpbroadcast<dq>, 0x6658 | <dq:opc>, AVX512F, Modrm|Masking|Space0F38|<dq:vexw>|Disp8MemShift|NoSuf, { RegXMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vpbroadcastq, 0x6659, AVX512F, Modrm|Masking|Space0F38|VexW1|Disp8MemShift|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpbroadcast<dq>, 0x667c, AVX512F, Modrm|Masking|Space0F38|<dq:vexw64>|NoSuf, { <dq:gpr>, RegXMM|RegYMM|RegZMM }
 
 vcmp<frel>p<sd>, 0x<sd:ppfx>C2/0x<frel:imm>, AVX512F, Modrm|Masking|Space0F|VexVVVV|<sd:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt|SAE, { RegXMM|RegYMM|RegZMM|<sd:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegMask }
@@ -2246,9 +2246,6 @@ vextracti32x4, 0x6639, AVX512F, Modrm|Ma
 vextractf64x4, 0x661B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
 vextracti64x4, 0x663B, AVX512F, Modrm|EVex=1|Masking|Space0F3A|VexW=2|Disp8MemShift=5|NoSuf, { Imm8, RegZMM, RegYMM|Unspecified|BaseIndex }
 
-vextractps, 0x6617, AVX512F, Modrm|EVex128|Space0F3A|VexWIG|Disp8MemShift=2|NoSuf, { Imm8, RegXMM, Reg32|Dword|Unspecified|BaseIndex }
-vextractps, 0x6617, AVX512F|x64, RegMem|EVex128|Space0F3A|VexWIG|NoSuf, { Imm8, RegXMM, Reg64 }
-
 vfixupimmp<sd>, 0x6654, AVX512F, Modrm|Masking|Space0F3A|VexVVVV|<sd:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|SAE, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|<sd:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vfixupimms<sd>, 0x6655, AVX512F, Modrm|EVexLIG|Masking|Space0F3A|VexVVVV|<sd:vexw>|Disp8MemShift|NoSuf|SAE, { Imm8|Imm8S, RegXMM|<sd:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
 
@@ -2304,8 +2301,6 @@ vmovap<sd>, 0x<sd:ppfx>28, AVX512F, D|Mo
 vmovntp<sd>, 0x<sd:ppfx>2B, AVX512F, Modrm|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM, XMMword|YMMword|ZMMword|Unspecified|BaseIndex }
 vmovup<sd>, 0x<sd:ppfx>10, AVX512F, D|Modrm|Masking|Space0F|<sd:vexw>|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
-vmovd, 0x666E, AVX512F, D|Modrm|EVex=2|Space0F|Disp8MemShift=2|NoSuf, { Reg32|Unspecified|BaseIndex, RegXMM }
-
 vmovddup, 0xF212, AVX512F, Modrm|Masking|Space0F|VexW=2|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Unspecified|BaseIndex, RegYMM|RegZMM }
 
 vmovdqa64, 0x666F, AVX512F, D|Modrm|Masking|Space0F|VexW=2|Disp8ShiftVL|CheckOperandSize|NoSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
@@ -2322,7 +2317,6 @@ vmovhp<sd>, 0x<sd:ppfx>17, AVX512F, Modr
 vmovlp<sd>, 0x<sd:ppfx>12, AVX512F, Modrm|EVexLIG|Space0F|VexVVVV|<sd:vexw>|Disp8MemShift=3|NoSuf, { Qword|Unspecified|BaseIndex, RegXMM, RegXMM }
 vmovlp<sd>, 0x<sd:ppfx>13, AVX512F, Modrm|EVexLIG|Space0F|<sd:vexw>|Disp8MemShift=3|NoSuf, { RegXMM, Qword|Unspecified|BaseIndex }
 
-vmovq, 0x666E, AVX512F|x64, D|Modrm|EVex128|Space0F|VexW1|Disp8MemShift=3|NoSuf, { Reg64|Unspecified|BaseIndex, RegXMM }
 vmovq, 0xF37E, AVX512F, Load|Modrm|EVex=2|Space0F|VexW1|Disp8MemShift=3|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vmovq, 0x66D6, AVX512F, Modrm|EVex=2|Space0F|VexW1|Disp8MemShift=3|NoSuf, { RegXMM, Qword|Unspecified|BaseIndex|RegXMM }
 
@@ -2360,15 +2354,10 @@ vpcmp<irel>u<dq>, 0x661e/<irel:imm>, AVX
 vptestm<dq>, 0x6627, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegMask }
 vptestnm<dq>, 0xf327, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<dq:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<dq:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegMask }
 
-vpermd, 0x6636, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
-vpermps, 0x6616, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW0|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Dword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
-
-vpermilp<sd>, 0x6604 | <sd:opc>, AVX512F, Modrm|Masking|Space0F3A|<sd:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|<sd:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vpermilp<sd>, 0x660C | <sd:opc>, AVX512F, Modrm|Masking|Space0F38|VexVVVV|<sd:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|<sd:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpermilpd, 0x6605, AVX512F, Modrm|Masking|Space0F3A|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vpermilpd, 0x660d, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
-vpermpd, 0x6601, AVX512F, Modrm|Masking|Space0F3A|VexW=2|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
 vpermpd, 0x6616, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
-vpermq, 0x6600, AVX512F, Modrm|Masking|Space0F3A|VexW=2|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM }
 vpermq, 0x6636, AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegYMM|RegZMM, RegYMM|RegZMM }
 
 vpmovdb, 0xF331, AVX512F, Modrm|EVex=1|Masking|Space0F38|VexW=1|Disp8MemShift=4|NoSuf, { RegZMM, RegXMM|Unspecified|BaseIndex }
@@ -2593,31 +2582,11 @@ vpmovsqw, 0xF324, AVX512F|AVX512VL, Modr
 vpmovusqw, 0xF314, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM, RegXMM|Dword|Unspecified|BaseIndex }
 vpmovusqw, 0xF314, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexW0|Disp8MemShift=3|NoSuf, { RegYMM, RegXMM|Qword|Unspecified|BaseIndex }
 
-vpmovsxbd, 0x6621, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
-vpmovsxbd, 0x6621, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
-vpmovzxbd, 0x6631, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
-vpmovzxbd, 0x6631, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
-
-vpmovsxbq, 0x6622, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
-vpmovsxbq, 0x6622, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
-vpmovzxbq, 0x6632, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=1|NoSuf, { RegXMM|Word|Unspecified|BaseIndex, RegXMM }
-vpmovzxbq, 0x6632, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
-
 vpmovsxdq, 0x6625, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexW0|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
 vpmovsxdq, 0x6625, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexW=1|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
 vpmovzxdq, 0x6635, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexW0|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
 vpmovzxdq, 0x6635, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexW=1|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
 
-vpmovsxwd, 0x6623, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
-vpmovsxwd, 0x6623, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
-vpmovzxwd, 0x6633, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
-vpmovzxwd, 0x6633, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
-
-vpmovsxwq, 0x6624, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
-vpmovsxwq, 0x6624, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
-vpmovzxwq, 0x6634, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexWIG|Disp8MemShift=2|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegXMM }
-vpmovzxwq, 0x6634, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexWIG|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegYMM }
-
 // AVX512VL instructions end.
 
 // AVX512BW instructions.
@@ -2960,7 +2929,6 @@ vpshufbitqmb, 0x668f, AVX512_BITALG, Mod
 
 vgf2p8affineinvqb, 0x66cf, GFNI|AVX512F, Modrm|Masking|Space0F3A|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vgf2p8affineqb, 0x66ce, GFNI|AVX512F, Modrm|Masking|Space0F3A|VexVVVV|VexW1|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8, RegXMM|RegYMM|RegZMM|Qword|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vgf2p8mulb, 0x66cf, GFNI|AVX512F, Modrm|Masking|Space0F38|VexVVVV|VexW0|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 // AVX512 + GFNI instructions end
 



* [PATCH v2 2/4] x86: fold VAES/VPCLMULQDQ VEX and EVEX templates
  2023-09-19 15:43 [PATCH v2 0/4] x86: fold a number of VEX and EVEX templates Jan Beulich
  2023-09-19 15:44 ` [PATCH v2 1/4] x86: fold certain " Jan Beulich
@ 2023-09-19 15:45 ` Jan Beulich
  2023-09-19 15:45 ` [PATCH v2 3/4] x86: fold FMA " Jan Beulich
  2023-09-19 15:45 ` [PATCH v2 4/4] x86: fold F16C " Jan Beulich
  3 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2023-09-19 15:45 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

Following the folding of some generic AVX/AVX2 templates with their
AVX512F counterpart ones, do this for VAES and VPCLMULQDQ ones as well.
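
As an illustration (example of mine, not part of the patch), the folded
templates again leave the encoding to be derived from the operands:

        vaesenc %xmm1, %xmm2, %xmm3          # AES + AVX, VEX-encoded
        vaesenc %ymm1, %ymm2, %ymm3          # VAES, VEX-encoded
        vaesenc %zmm1, %zmm2, %zmm3          # VAES + AVX512F, EVEX-encoded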

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1944,7 +1944,17 @@ cpu_flags_match (const insn_template *t)
       cpu = cpu_flags_and (x, cpu);
       if (!cpu_flags_all_zero (&cpu))
 	{
-	  if (x.bitfield.cpuavx)
+	  if (t->cpu.bitfield.cpuavx && t->cpu.bitfield.cpuavx512f)
+	    {
+	      if ((need_evex_encoding ()
+		   ? cpu.bitfield.cpuavx512f
+		   : cpu.bitfield.cpuavx)
+		  && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
+		  && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
+		  && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
+	    match |= CPU_FLAGS_ARCH_MATCH;
+	    }
+	  else if (x.bitfield.cpuavx)
 	    {
 	      /* We need to check a few extra flags with AVX.  */
 	      if (cpu.bitfield.cpuavx
@@ -1961,9 +1971,7 @@ cpu_flags_match (const insn_template *t)
 	    {
 	      /* We need to check a few extra flags with AVX512F.  */
 	      if (cpu.bitfield.cpuavx512f
-		  && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
-		  && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
-		  && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
+		  && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni))
 		match |= CPU_FLAGS_ARCH_MATCH;
 	    }
 	  else
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -2068,20 +2068,20 @@ vsm4rnds4, 0xf2da, SM4, Modrm|Space0F38|
 
 // VAES
 
-vaesdec, 0x66de, VAES, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
-vaesdeclast, 0x66df, VAES, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
-vaesenc, 0x66dc, VAES, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
-vaesenclast, 0x66dd, VAES, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { RegXMM|RegYMM|Unspecified|BaseIndex, RegXMM|RegYMM, RegXMM|RegYMM }
+vaesdec, 0x66de, VAES|AVX|AVX512F, Modrm|Vex|EVexDYN|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vaesdeclast, 0x66df, VAES|AVX|AVX512F, Modrm|Vex|EVexDYN|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vaesenc, 0x66dc, VAES|AVX|AVX512F, Modrm|Vex|EVexDYN|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vaesenclast, 0x66dd, VAES|AVX|AVX512F, Modrm|Vex|EVexDYN|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 // VAES instructions end
 
 // VPCLMULQDQ instructions
 
-vpclmulqdq, 0x6644, VPCLMULQDQ, Modrm|Vex|Space0F3A|VexWIG|VexVVVV|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpclmullqlqdq, 0x6644/0x00, VPCLMULQDQ, Modrm|Vex|Space0F3A|VexWIG|VexVVVV|CheckOperandSize|NoSuf|ImmExt, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpclmulhqlqdq, 0x6644/0x01, VPCLMULQDQ, Modrm|Vex|Space0F3A|VexWIG|VexVVVV|CheckOperandSize|NoSuf|ImmExt, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpclmullqhqdq, 0x6644/0x10, VPCLMULQDQ, Modrm|Vex|Space0F3A|VexWIG|VexVVVV|CheckOperandSize|NoSuf|ImmExt, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ, Modrm|Vex|Space0F3A|VexWIG|VexVVVV|CheckOperandSize|NoSuf|ImmExt, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpclmulqdq, 0x6644, VPCLMULQDQ|AVX|AVX512F, Modrm|Space0F3A|Vex|EVexDYN|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpclmullqlqdq, 0x6644/0x00, VPCLMULQDQ|AVX|AVX512F, Modrm|Space0F3A|Vex|EVexDYN|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpclmulhqlqdq, 0x6644/0x01, VPCLMULQDQ|AVX|AVX512F, Modrm|Space0F3A|Vex|EVexDYN|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpclmullqhqdq, 0x6644/0x10, VPCLMULQDQ|AVX|AVX512F, Modrm|Space0F3A|Vex|EVexDYN|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ|AVX|AVX512F, Modrm|Space0F3A|Vex|EVexDYN|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
 // VPCLMULQDQ instructions end
 
@@ -2932,25 +2932,6 @@ vgf2p8affineqb, 0x66ce, GFNI|AVX512F, Mo
 
 // AVX512 + GFNI instructions end
 
-// AVX512 + VAES instructions
-
-vaesdec, 0x66de, VAES|AVX512F, Modrm|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vaesdeclast, 0x66df, VAES|AVX512F, Modrm|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vaesenc, 0x66dc, VAES|AVX512F, Modrm|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vaesenclast, 0x66dd, VAES|AVX512F, Modrm|Space0F38|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-
-// AVX512 + VAES instructions end
-
-// AVX512 + VPCLMULQDQ instructions
-
-vpclmulqdq, 0x6644, VPCLMULQDQ|AVX512F, Modrm|Space0F3A|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf, { Imm8|Imm8S, RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpclmullqlqdq, 0x6644/0x00, VPCLMULQDQ|AVX512F, Modrm|Space0F3A|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpclmulhqlqdq, 0x6644/0x01, VPCLMULQDQ|AVX512F, Modrm|Space0F3A|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpclmullqhqdq, 0x6644/0x10, VPCLMULQDQ|AVX512F, Modrm|Space0F3A|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ|AVX512F, Modrm|Space0F3A|VexWIG|VexVVVV|Disp8ShiftVL|CheckOperandSize|NoSuf|ImmExt, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-
-// AVX512 + VPCLMULQDQ instructions end
-
 // INVLPGB instructions
 
 invlpgb, 0xf01fe, INVLPGB, NoSuf, {}



* [PATCH v2 3/4] x86: fold FMA VEX and EVEX templates
  2023-09-19 15:43 [PATCH v2 0/4] x86: fold a number of VEX and EVEX templates Jan Beulich
  2023-09-19 15:44 ` [PATCH v2 1/4] x86: fold certain " Jan Beulich
  2023-09-19 15:45 ` [PATCH v2 2/4] x86: fold VAES/VPCLMULQDQ " Jan Beulich
@ 2023-09-19 15:45 ` Jan Beulich
  2023-09-19 15:45 ` [PATCH v2 4/4] x86: fold F16C " Jan Beulich
  3 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2023-09-19 15:45 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

Following the folding of some generic AVX/AVX2 templates with their
AVX512F counterpart ones, do this for FMA ones as well, requiring one
further adjustment to cpu_flags_match().
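
As an illustration (example of mine, not part of the patch): the folded
templates let the same FMA mnemonics cover both VEX and EVEX uses, e.g.

        vfmadd213ps %ymm1, %ymm2, %ymm3               # FMA, VEX-encoded
        vfmadd213ps %zmm1, %zmm2, %zmm3               # AVX512F, EVEX-encoded
        vfmadd213ps (%rax){1to16}, %zmm2, %zmm3       # broadcast is EVEX-only
        vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm3     # rounding control is
                                                      # EVEX-only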
---
v2: Eliminate unwanted side effect.

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1926,6 +1926,8 @@ cpu_flags_match (const insn_template *t)
 		{
 		  x.bitfield.cpuavx512f = 0;
 		  x.bitfield.cpuavx512vl = 0;
+		  if (x.bitfield.cpufma && !cpu.bitfield.cpufma)
+		    x.bitfield.cpuavx = 0;
 		}
 	    }
 	}
@@ -1949,6 +1951,8 @@ cpu_flags_match (const insn_template *t)
 	      if ((need_evex_encoding ()
 		   ? cpu.bitfield.cpuavx512f
 		   : cpu.bitfield.cpuavx)
+		  && (!x.bitfield.cpufma || cpu.bitfield.cpufma
+		      || cpu_arch_flags.bitfield.cpuavx512f)
 		  && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
 		  && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
 		  && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1802,16 +1802,21 @@ vcvtps2ph, 0x661d, F16C, Modrm|Vex=2|Spa
 
 <fma:opc, 132:10, 213:20, 231:30>
 
-vfmadd<fma>p<sd>, 0x6688 | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfmadd<fma>s<sd>, 0x6689 | 0x<fma:opc>, FMA, Modrm|VexLIG|Space0F38|VexVVVV|<sd:vexw>|NoSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
-vfmaddsub<fma>p<sd>, 0x6686 | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfmsub<fma>p<sd>, 0x668a | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfmsub<fma>s<sd>, 0x668b | 0x<fma:opc>, FMA, Modrm|VexLIG|Space0F38|VexVVVV|<sd:vexw>|NoSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
-vfmsubadd<fma>p<sd>, 0x6687 | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfnmadd<fma>p<sd>, 0x668c | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfnmadd<fma>s<sd>, 0x668d | 0x<fma:opc>, FMA, Modrm|VexLIG|Space0F38|VexVVVV|<sd:vexw>|NoSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
-vfnmsub<fma>p<sd>, 0x668e | 0x<fma:opc>, FMA, Modrm|Vex|Space0F38|VexVVVV|<sd:vexw>|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vfnmsub<fma>s<sd>, 0x668f | 0x<fma:opc>, FMA, Modrm|VexLIG|Space0F38|VexVVVV|<sd:vexw>|NoSuf, { <sd:elem>|Unspecified|BaseIndex|RegXMM, RegXMM, RegXMM }
+<sdh:cpu:cpudq:fma:ppfx:spfx:pfx:spc1:spc2:opc:vex:vexlig:vexw:elem, +
+    s:AVX512F:AVX512DQ:FMA|AVX|AVX512F::f3:66:Space0F:Space0F38:0:Vex|EVexDYN:VexLIG|EVexLIG:VexW0:Dword, +
+    d:AVX512F:AVX512DQ:FMA|AVX|AVX512F:66:f2:66:Space0F:Space0F38:1:Vex|EVexDYN:VexLIG|EVexLIG:VexW1:Qword, +
+    h:AVX512_FP16:AVX512_FP16:AVX512_FP16::f3::EVexMap5:EVexMap6:0::EVexLIG:VexW0:Word>
+
+vfmadd<fma>p<sdh>, 0x6688 | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfmadd<fma>s<sdh>, 0x6689 | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vexlig>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
+vfmaddsub<fma>p<sdh>, 0x6686 | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfmsub<fma>p<sdh>, 0x668a | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfmsub<fma>s<sdh>, 0x668b | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vexlig>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
+vfmsubadd<fma>p<sdh>, 0x6687 | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfnmadd<fma>p<sdh>, 0x668c | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfnmadd<fma>s<sdh>, 0x668d | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vexlig>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
+vfnmsub<fma>p<sdh>, 0x668e | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vex>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
+vfnmsub<fma>s<sdh>, 0x668f | 0x<fma:opc>, <sdh:fma>, Modrm|<sdh:vexlig>|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
 
 // HLE prefixes
 
@@ -2087,11 +2092,6 @@ vpclmulhqhqdq, 0x6644/0x11, VPCLMULQDQ|A
 
 // AVX512F instructions.
 
-<sdh:cpu:cpudq:ppfx:spfx:pfx:spc1:spc2:opc:vexw:elem, +
-    s:AVX512F:AVX512DQ::f3:66:Space0F:Space0F38:0:VexW0:Dword, +
-    d:AVX512F:AVX512DQ:66:f2:66:Space0F:Space0F38:1:VexW1:Qword, +
-    h:AVX512_FP16:AVX512_FP16::f3::EVexMap5:EVexMap6:0:VexW0:Word>
-
 // <Exy> is used for EVEX instructions with x/y suffixes.
 <Exy:vl:attr:sr:sae:src:dst, +
     $z::EVex512|Disp8MemShift=6:StaticRounding|SAE:SAE:RegZMM|Unspecified|BaseIndex:RegYMM, +
@@ -2255,17 +2255,6 @@ vgetmants<sdh>, 0x<sdh:pfx>27, <sdh:cpu>
 vrndscalep<sdh>, 0x<sdh:pfx>08 | <sdh:opc>, <sdh:cpu>, Modrm|Masking|Space0F3A|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|SAE, { Imm8, RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vrndscales<sdh>, 0x<sdh:pfx>0a | <sdh:opc>, <sdh:cpu>, Modrm|EVexLIG|Masking|Space0F3A|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|SAE, { Imm8, RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
 
-vfmadd<fma>p<sdh>, 0x6688 | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfmadd<fma>s<sdh>, 0x6689 | 0x<fma:opc>, <sdh:cpu>, Modrm|EVexLIG|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
-vfmaddsub<fma>p<sdh>, 0x6686 | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfmsub<fma>p<sdh>, 0x668a | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfmsub<fma>s<sdh>, 0x668b | 0x<fma:opc>, <sdh:cpu>, Modrm|EVexLIG|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
-vfmsubadd<fma>p<sdh>, 0x6687 | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfnmadd<fma>p<sdh>, 0x668c | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfnmadd<fma>s<sdh>, 0x668d | 0x<fma:opc>, <sdh:cpu>, Modrm|EVexLIG|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
-vfnmsub<fma>p<sdh>, 0x668e | 0x<fma:opc>, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
-vfnmsub<fma>s<sdh>, 0x668f | 0x<fma:opc>, <sdh:cpu>, Modrm|EVexLIG|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
-
 vscalefp<sdh>, 0x662c, <sdh:cpu>, Modrm|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Broadcast|Disp8ShiftVL|CheckOperandSize|NoSuf|StaticRounding|SAE, { RegXMM|RegYMM|RegZMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 vscalefs<sdh>, 0x662d, <sdh:cpu>, Modrm|EVexLIG|Masking|<sdh:spc2>|VexVVVV|<sdh:vexw>|Disp8MemShift|NoSuf|StaticRounding|SAE, { RegXMM|<sdh:elem>|Unspecified|BaseIndex, RegXMM, RegXMM }
 



* [PATCH v2 4/4] x86: fold F16C VEX and EVEX templates
  2023-09-19 15:43 [PATCH v2 0/4] x86: fold a number of VEX and EVEX templates Jan Beulich
                   ` (2 preceding siblings ...)
  2023-09-19 15:45 ` [PATCH v2 3/4] x86: fold FMA " Jan Beulich
@ 2023-09-19 15:45 ` Jan Beulich
  3 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2023-09-19 15:45 UTC (permalink / raw)
  To: Binutils; +Cc: H.J. Lu

Following the folding of some generic AVX/AVX2 templates with their
AVX512F counterpart ones, do this for F16C ones as well, requiring one
further adjustment to cpu_flags_match(). Note that there is a slight
asymmetry with the FMA checks, resulting from the various vector lengths
having separate insn templates here, whereas FMA uses a single combined
template (each).
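
As an illustration (example of mine, not part of the patch):

        vcvtph2ps %xmm1, %xmm2          # F16C, VEX-encoded (128-bit template)
        vcvtph2ps %xmm1, %ymm2          # F16C, VEX-encoded (256-bit template)
        vcvtph2ps %xmm1, %ymm2{%k1}     # masking forces EVEX (AVX512VL)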
---
TBD: Unlike for FMA, the gains aren't as big, yet the code changes are
     slightly bigger. The change may therefore be deemed not worth it.
---
v2: Eliminate unwanted side effect.

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -1926,9 +1926,12 @@ cpu_flags_match (const insn_template *t)
 		{
 		  x.bitfield.cpuavx512f = 0;
 		  x.bitfield.cpuavx512vl = 0;
-		  if (x.bitfield.cpufma && !cpu.bitfield.cpufma)
+		  if ((x.bitfield.cpufma && !cpu.bitfield.cpufma)
+		      || (x.bitfield.cpuf16c && !cpu.bitfield.cpuf16c))
 		    x.bitfield.cpuavx = 0;
 		}
+	      else if (cpu.bitfield.cpuf16c)
+		x.bitfield.cpuavx512vl = 0;
 	    }
 	}
 
@@ -1953,6 +1956,8 @@ cpu_flags_match (const insn_template *t)
 		   : cpu.bitfield.cpuavx)
 		  && (!x.bitfield.cpufma || cpu.bitfield.cpufma
 		      || cpu_arch_flags.bitfield.cpuavx512f)
+		  && (!x.bitfield.cpuf16c || cpu.bitfield.cpuf16c
+		      || cpu_arch_flags.bitfield.cpuavx512f)
 		  && (!x.bitfield.cpugfni || cpu.bitfield.cpugfni)
 		  && (!x.bitfield.cpuvaes || cpu.bitfield.cpuvaes)
 		  && (!x.bitfield.cpuvpclmulqdq || cpu.bitfield.cpuvpclmulqdq))
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1793,10 +1793,10 @@ rdgsbase, 0xf30fae/1, FSGSBase, Modrm|Ig
 rdrand, 0xfc7/6, RdRnd, Modrm|NoSuf, { Reg16|Reg32|Reg64 }
 wrfsbase, 0xf30fae/2, FSGSBase, Modrm|IgnoreSize|NoSuf, { Reg32|Reg64 }
 wrgsbase, 0xf30fae/3, FSGSBase, Modrm|IgnoreSize|NoSuf, { Reg32|Reg64 }
-vcvtph2ps, 0x6613, F16C, Modrm|Vex|Space0F38|VexW0|NoSuf, { Qword|Unspecified|BaseIndex|RegXMM, RegXMM }
-vcvtph2ps, 0x6613, F16C, Modrm|Vex=2|Space0F38|VexW=1|NoSuf, { Unspecified|BaseIndex|RegXMM, RegYMM }
-vcvtps2ph, 0x661d, F16C, Modrm|Vex|Space0F3A|VexW0|NoSuf, { Imm8, RegXMM, Qword|Unspecified|BaseIndex|RegXMM }
-vcvtps2ph, 0x661d, F16C, Modrm|Vex=2|Space0F3A|VexW=1|NoSuf, { Imm8, RegYMM, Unspecified|BaseIndex|RegXMM }
+vcvtph2ps, 0x6613, F16C|AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F38|VexW0|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
+vcvtph2ps, 0x6613, F16C|AVX|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F38|VexW0|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
+vcvtps2ph, 0x661D, F16C|AVX|AVX512F|AVX512VL, Modrm|Vex128|EVex128|Masking|Space0F3A|VexW0|Disp8MemShift=3|NoSuf, { Imm8, RegXMM, RegXMM|Qword|Unspecified|BaseIndex }
+vcvtps2ph, 0x661D, F16C|AVX|AVX512F|AVX512VL, Modrm|Vex256|EVex256|Masking|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, RegXMM|Unspecified|BaseIndex }
 
 // FMA instructions
 
@@ -2525,15 +2525,9 @@ vcvtdq2pd, 0xF3E6, AVX512F|AVX512VL, Mod
 vcvtudq2pd, 0xF37A, AVX512F|AVX512VL, Modrm|EVex128|Masking|Space0F|VexW0|Broadcast|Disp8MemShift=3|NoSuf, { RegXMM|Dword|Qword|Unspecified|BaseIndex, RegXMM }
 vcvtudq2pd, 0xF37A, AVX512F|AVX512VL, Modrm|EVex256|Masking|Space0F|VexW0|Broadcast|Disp8MemShift=4|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
 
-vcvtph2ps, 0x6613, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexW0|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
-vcvtph2ps, 0x6613, AVX512F|AVX512VL, Modrm|EVex=3|Masking|Space0F38|VexW=1|Disp8MemShift=4|NoSuf, { RegXMM|Unspecified|BaseIndex, RegYMM }
-
 vcvtps2pd, 0x5A, AVX512F|AVX512VL, Modrm|EVex128|Masking|Space0F|VexW0|Broadcast|Disp8MemShift=3|NoSuf, { RegXMM|Dword|Qword|Unspecified|BaseIndex, RegXMM }
 vcvtps2pd, 0x5A, AVX512F|AVX512VL, Modrm|EVex256|Masking|Space0F|VexW0|Broadcast|Disp8MemShift=4|NoSuf, { RegXMM|Dword|Unspecified|BaseIndex, RegYMM }
 
-vcvtps2ph, 0x661D, AVX512F|AVX512VL, Modrm|EVex128|Masking|Space0F3A|VexW0|Disp8MemShift=3|NoSuf, { Imm8, RegXMM, RegXMM|Qword|Unspecified|BaseIndex }
-vcvtps2ph, 0x661D, AVX512F|AVX512VL, Modrm|EVex256|Masking|Space0F3A|VexW0|Disp8MemShift=4|NoSuf, { Imm8, RegYMM, RegXMM|Unspecified|BaseIndex }
-
 vmovddup, 0xF212, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F|VexW1|Disp8MemShift=3|NoSuf, { RegXMM|Qword|Unspecified|BaseIndex, RegXMM }
 
 vpmovdb, 0xF331, AVX512F|AVX512VL, Modrm|EVex=2|Masking|Space0F38|VexW0|Disp8MemShift=2|NoSuf, { RegXMM, RegXMM|Dword|Unspecified|BaseIndex }

