public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
From: "Cui, Lili" <lili.cui@intel.com>
To: "Beulich, Jan" <JBeulich@suse.com>
Cc: "H.J. Lu" <hjl.tools@gmail.com>,
	"binutils@sourceware.org" <binutils@sourceware.org>
Subject: RE: [PATCH] Support APX NF
Date: Fri, 1 Mar 2024 11:36:41 +0000	[thread overview]
Message-ID: <PH0PR11MB5593C80CDBC004E5FF0B7CEA9E5E2@PH0PR11MB5593.namprd11.prod.outlook.com> (raw)
In-Reply-To: <e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com>

> > --- /dev/null
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
> >[...]
> > +	{nf}	ror	%cl, 291(%r8, %rax, 4), %r9
> > +	{nf}	sar	$1, %bl
> > +	{nf}	sar	$1, %bl, %dl
> > +	{nf}	sar	$1, %dx
> > +	{nf}	sar	$1, %dx, %ax
> > +	{nf}	sar	$1, %ecx
> > +	{nf}	sar	$1, %ecx, %edx
> > +	{nf}	sar	$1, %r9
> > +	{nf}	sar	$1, %r9, %r31
> > +	{nf}	sarb	$1, 291(%r8, %rax, 4)
> > +	{nf}	sar	$1, 291(%r8, %rax, 4), %bl
> > +	{nf}	sarw	$1, 291(%r8, %rax, 4)
> > +	{nf}	sar	$1, 291(%r8, %rax, 4), %dx
> > +	{nf}	sarl	$1, 291(%r8, %rax, 4)
> > +	{nf}	sar	$1, 291(%r8, %rax, 4), %ecx
> > +	{nf}	sarq	$1, 291(%r8, %rax, 4)
> > +	{nf}	sar	$1, 291(%r8, %rax, 4), %r9
> > +	{nf}	sar	$123, %bl
> > +	{nf}	sar	$123, %bl, %dl
> > +	{nf}	sar	$123, %dx
> > +	{nf}	sar	$123, %dx, %ax
> > +	{nf}	sar	$123, %ecx
> > +	{nf}	sar	$123, %ecx, %edx
> > +	{nf}	sar	$123, %r9
> > +	{nf}	sar	$123, %r9, %r31
> > +	{nf}	sarb	$123, 291(%r8, %rax, 4)
> > +	{nf}	sar	$123, 291(%r8, %rax, 4), %bl
> > +	{nf}	sarw	$123, 291(%r8, %rax, 4)
> > +	{nf}	sar	$123, 291(%r8, %rax, 4), %dx
> > +	{nf}	sarl	$123, 291(%r8, %rax, 4)
> > +	{nf}	sar	$123, 291(%r8, %rax, 4), %ecx
> > +	{nf}	sarq	$123, 291(%r8, %rax, 4)
> > +	{nf}	sar	$123, 291(%r8, %rax, 4), %r9
> > +	{nf}	sar	%cl, %bl
> > +	{nf}	sar	%cl, %bl, %dl
> > +	{nf}	sar	%cl, %dx
> > +	{nf}	sar	%cl, %dx, %ax
> > +	{nf}	sar	%cl, %ecx
> > +	{nf}	sar	%cl, %ecx, %edx
> > +	{nf}	sar	%cl, %r9
> > +	{nf}	sar	%cl, %r9, %r31
> > +	{nf}	sarb	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sar	%cl, 291(%r8, %rax, 4), %bl
> > +	{nf}	sarw	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sar	%cl, 291(%r8, %rax, 4), %dx
> > +	{nf}	sarl	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sar	%cl, 291(%r8, %rax, 4), %ecx
> > +	{nf}	sarq	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sar	%cl, 291(%r8, %rax, 4), %r9
> > +	{nf}	shl	$1, %bl
> > +	{nf}	shl	$1, %bl, %dl
> > +	{nf}	shl	$1, %dx
> > +	{nf}	shl	$1, %dx, %ax
> > +	{nf}	shl	$1, %ecx
> > +	{nf}	shl	$1, %ecx, %edx
> > +	{nf}	shl	$1, %r9
> > +	{nf}	shl	$1, %r9, %r31
> > +	{nf}	shlb	$1, 291(%r8, %rax, 4)
> > +	{nf}	shl	$1, 291(%r8, %rax, 4), %bl
> > +	{nf}	shlw	$1, 291(%r8, %rax, 4)
> > +	{nf}	shl	$1, 291(%r8, %rax, 4), %dx
> > +	{nf}	shll	$1, 291(%r8, %rax, 4)
> > +	{nf}	shl	$1, 291(%r8, %rax, 4), %ecx
> > +	{nf}	shlq	$1, 291(%r8, %rax, 4)
> > +	{nf}	shl	$1, 291(%r8, %rax, 4), %r9
> > +	{nf}	shl	$123, %bl
> > +	{nf}	shl	$123, %bl, %dl
> > +	{nf}	shl	$123, %dx
> > +	{nf}	shl	$123, %dx, %ax
> > +	{nf}	shl	$123, %ecx
> > +	{nf}	shl	$123, %ecx, %edx
> > +	{nf}	shl	$123, %r9
> > +	{nf}	shl	$123, %r9, %r31
> > +	{nf}	shlb	$123, 291(%r8, %rax, 4)
> > +	{nf}	shl	$123, 291(%r8, %rax, 4), %bl
> > +	{nf}	shlw	$123, 291(%r8, %rax, 4)
> > +	{nf}	shl	$123, 291(%r8, %rax, 4), %dx
> > +	{nf}	shll	$123, 291(%r8, %rax, 4)
> > +	{nf}	shl	$123, 291(%r8, %rax, 4), %ecx
> > +	{nf}	shlq	$123, 291(%r8, %rax, 4)
> > +	{nf}	shl	$123, 291(%r8, %rax, 4), %r9
> > +	{nf}	shl	%cl, %bl
> > +	{nf}	shl	%cl, %bl, %dl
> > +	{nf}	shl	%cl, %dx
> > +	{nf}	shl	%cl, %dx, %ax
> > +	{nf}	shl	%cl, %ecx
> > +	{nf}	shl	%cl, %ecx, %edx
> > +	{nf}	shl	%cl, %r9
> > +	{nf}	shl	%cl, %r9, %r31
> > +	{nf}	shlb	%cl, 291(%r8, %rax, 4)
> > +	{nf}	shl	%cl, 291(%r8, %rax, 4), %bl
> > +	{nf}	shlw	%cl, 291(%r8, %rax, 4)
> > +	{nf}	shl	%cl, 291(%r8, %rax, 4), %dx
> > +	{nf}	shll	%cl, 291(%r8, %rax, 4)
> > +	{nf}	shl	%cl, 291(%r8, %rax, 4), %ecx
> > +	{nf}	shlq	%cl, 291(%r8, %rax, 4)
> > +	{nf}	shl	%cl, 291(%r8, %rax, 4), %r9
> > +	{nf}	sal	$1, %bl
> > +	{nf}	sal	$1, %bl, %dl
> > +	{nf}	sal	$1, %dx
> > +	{nf}	sal	$1, %dx, %ax
> > +	{nf}	sal	$1, %ecx
> > +	{nf}	sal	$1, %ecx, %edx
> > +	{nf}	sal	$1, %r9
> > +	{nf}	sal	$1, %r9, %r31
> > +	{nf}	salb	$1, 291(%r8, %rax, 4)
> > +	{nf}	sal	$1, 291(%r8, %rax, 4), %bl
> > +	{nf}	salw	$1, 291(%r8, %rax, 4)
> > +	{nf}	sal	$1, 291(%r8, %rax, 4), %dx
> > +	{nf}	sall	$1, 291(%r8, %rax, 4)
> > +	{nf}	sal	$1, 291(%r8, %rax, 4), %ecx
> > +	{nf}	salq	$1, 291(%r8, %rax, 4)
> > +	{nf}	sal	$1, 291(%r8, %rax, 4), %r9
> > +	{nf}	sal	$123, %bl
> > +	{nf}	sal	$123, %bl, %dl
> > +	{nf}	sal	$123, %dx
> > +	{nf}	sal	$123, %dx, %ax
> > +	{nf}	sal	$123, %ecx
> > +	{nf}	sal	$123, %ecx, %edx
> > +	{nf}	sal	$123, %r9
> > +	{nf}	sal	$123, %r9, %r31
> > +	{nf}	salb	$123, 291(%r8, %rax, 4)
> > +	{nf}	sal	$123, 291(%r8, %rax, 4), %bl
> > +	{nf}	salw	$123, 291(%r8, %rax, 4)
> > +	{nf}	sal	$123, 291(%r8, %rax, 4), %dx
> > +	{nf}	sall	$123, 291(%r8, %rax, 4)
> > +	{nf}	sal	$123, 291(%r8, %rax, 4), %ecx
> > +	{nf}	salq	$123, 291(%r8, %rax, 4)
> > +	{nf}	sal	$123, 291(%r8, %rax, 4), %r9
> > +	{nf}	sal	%cl, %bl
> > +	{nf}	sal	%cl, %bl, %dl
> > +	{nf}	sal	%cl, %dx
> > +	{nf}	sal	%cl, %dx, %ax
> > +	{nf}	sal	%cl, %ecx
> > +	{nf}	sal	%cl, %ecx, %edx
> > +	{nf}	sal	%cl, %r9
> > +	{nf}	sal	%cl, %r9, %r31
> > +	{nf}	salb	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sal	%cl, 291(%r8, %rax, 4), %bl
> > +	{nf}	salw	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sal	%cl, 291(%r8, %rax, 4), %dx
> > +	{nf}	sall	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sal	%cl, 291(%r8, %rax, 4), %ecx
> > +	{nf}	salq	%cl, 291(%r8, %rax, 4)
> > +	{nf}	sal	%cl, 291(%r8, %rax, 4), %r9
> 
> Hmm, I think sorting in the source file is more relevant than in the output
> (expectations), so I think this SAL block wants moving up. Would of course be
> yet more natural if we actually encoded SAL with ModR/M.reg=6 rather than
> the same encoding as SHL ...
> 

Done.

> > --- a/opcodes/i386-dis-evex-reg.h
> > +++ b/opcodes/i386-dis-evex-reg.h
> > @@ -51,33 +51,33 @@
> >    },
> >    /* REG_EVEX_MAP4_80 */
> >    {
> > -    { "addA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > -    { "orA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "%XNaddA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> > +    { "%XNorA",	{ VexGb, Eb, Ib }, NO_PREFIX },
> 
> Since there are quite a number of entries which are affected (and more to
> come), did you consider using a single-character macro here? I realize the
> three we presently have free don't fit overly well letter-wise, but it ought to be
> possible to e.g. free up F (rarely used, could become a two- letter one) for use
> here.
> 
> Seeing that you need to fiddle with the "case 'N'" code anyway, did you
> further consider giving 'N' a second purpose? Present and projected uses are
> easy to tell apart by being non-EVEX / EVEX respectively.
> 
> If we really wanted to stick to a two-letter one, I think it would further want
> considering to use %NF, such that its purpose is immediately clear from the
> letters used.
> 

Changed to %NF.

> > @@ -9147,6 +9150,10 @@ get_valid_dis386 (const struct dis386 *dp,
> instr_info *ins)
> >        ins->vex.v = *ins->codep & 0x8;
> >        ins->vex.mask_register_specifier = *ins->codep & 0x7;
> >        ins->vex.zeroing = *ins->codep & 0x80;
> > +      /* Set the NF bit for the EVEX instruction extended from the legacy or
> > +	 vex instruction, this bit will be cleared when it can be confirmed
> > +	 that its default type is evex.  */
> > +      ins->vex.nf = *ins->codep & 0x4;
> >
> >        if (ins->address_mode != mode_64bit)
> >  	{
> > @@ -9600,6 +9607,15 @@ print_insn (bfd_vma pc, disassemble_info *info,
> int intel_syntax)
> >  	  && ins.vex.prefix == DATA_PREFIX_OPCODE)
> >  	sizeflag ^= DFLAG;
> >
> > +      if(ins.evex_type == evex_default)
> > +	ins.vex.nf = false;
> 
> Up to here I think I agree.
> 
> > +      else
> > +	/* For EVEX-promoted formats, we need to clear EVEX.NF (For ccmp
> and
> > +	   ctest, they will be cleared separately.) in mask_register_specifier
> > +	   and keep the low 2 bits of mask_register_specifier to report errors
> > +	   for invalid cases.*/
> > +	ins.vex.mask_register_specifier &= 0x3;
> 
> But this I'm in trouble with: How would you recognize (and accordingly print)
> insns with NF wrongly set? (By implication there's also a respective testcase
> [addition] missing.)
> 

In the else branch here, "ins.vex.mask_register_specifier" and "ins.vex.nf" both hold the same NF bit, so I just clear the redundant copy. Otherwise, the NF bit in "ins.vex.mask_register_specifier" would later be printed as a k register. We still need the low 2 bits to preserve the original value; we will use them in subsequent patches.

Jan, is it necessary to report (bad) when the NF bit is set incorrectly? We would need to set a new flag in putop() for instructions that have %NF; then, after putop(), we could tell whether the NF bit was set incorrectly, which is a bit ugly. Do you have a better way?

> > --- a/opcodes/i386-opc.h
> > +++ b/opcodes/i386-opc.h
> > @@ -1017,7 +1017,8 @@ typedef struct insn_template
> >  #define Prefix_EVEX		7	/* {evex} */
> >  #define Prefix_REX		8	/* {rex} */
> >  #define Prefix_REX2		9	/* {rex2} */
> > -#define Prefix_NoOptimize	10	/* {nooptimize} */
> > +#define Prefix_NF		10	/* {nf} */
> > +#define Prefix_NoOptimize	11	/* {nooptimize} */
> 
> I find it increasingly puzzling that nooptimize is pushed all further down, for no
> real reason.

Ah yes, they are all pseudo prefixes; it is convenient to always put the new one at the end.

> 
> > --- a/opcodes/i386-opc.tbl
> > +++ b/opcodes/i386-opc.tbl
> > @@ -310,32 +310,42 @@ sti, 0xfb, 0, NoSuf, {}  // Arithmetic.
> >  add, 0x0, APX_F,
> > D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64 }  add, 0x0, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x0, APX_F,
> D|W|CheckOperandSize|Modrm|No_sSuf|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
> >  add, 0x83/0, APX_F,
> > Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF,
> { Imm8S,
> > Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }  add,
> > 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x83/0, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S,
> > +Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> >  add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> > Acc|Byte|Word|Dword|Qword }  add, 0x80/0, APX_F,
> > W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Imm8|Imm16|Imm32|Imm32S,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64}  add, 0x80/0, 0,
> > W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S,
> > Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x80/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, {
> > +Imm8|Imm16|Imm32|Imm32S,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
> 
> Adding these templates has a 2nd effect, for which no testcase is being added:
> They now allow (taking the example here) "{evex} add $1, %eax". Such a new
> test (which could be less extensive than the -nf one you already add) should
> then also cover ADCX and ADOX, for which the 2-operand EVEX templates
> were added prematurely.
> 

We have added the test cases you mentioned to a separate patch that adds all the {evex} test cases for the APX instructions. I'll send it out later.

> >  inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 }  inc,
> > 0xfe/0, APX_F,
> W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|NF,
> > {Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> Reg8|Reg16|Reg32|Reg64}
> > inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> >
> >  sub, 0x28, APX_F,
> > D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64, }  sub, 0x28, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +sub, 0x28, APX_F,
> > +D|W|CheckOperandSize|Modrm|No_sSuf|Optimize|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
> 
> What's the purpose of Optimize here? Just to repeat my earlier request:
> Please don't blindly copy all attributes when you clone templates. See how the
> existing APX template already doesn't have this attribute.
> Apparently you re-cloned the legacy one, not the APX one.
> 

I noticed the Optimize here before sending the patch. I didn't remove it because I felt some optimizations should be extended to {evex}, but now I realize I was wrong: the optimization of sub is meant to reduce the binary size, and it does not work with {evex}.

subq %r64, %r64     -> subl %r32, %r32

I will drop Optimize for NF.

> Specifically here, this template will only be chosen if either {nf} or {evex} is
> present. Both of which preclude the NDD->REX2 transformation, in turn
> making further optimization impossible.
> 
> As to {nf} precluding optimization: can_convert_NDD_to_legacy() checks
> i.tm.opcode_modifier.nf rather than i.has_nf. That's entirely dead code, as i.tm
> is populated only by install_template(). This check wants dropping in a prereq
> patch, I suppose, and then the patch here should add the correct check. I recall
> saying back then that a respective check needs adding here, not already in the
> patch introducing the transformation.
> 
> Jan

Wow, great finding, changed.

Thanks,
Lili.


  parent reply	other threads:[~2024-03-01 11:37 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-27  9:01 Cui, Lili
2024-02-28 16:11 ` H.J. Lu
2024-02-29  1:12   ` Cui, Lili
2024-02-29  6:53   ` Jan Beulich
2024-02-29  8:39     ` Cui, Lili
2024-02-29  9:06       ` Jan Beulich
2024-02-29 10:22         ` Cui, Lili
2024-02-29 12:23           ` H.J. Lu
2024-02-29 12:26             ` Cui, Lili
2024-02-29 11:21 ` Jan Beulich
2024-02-29 12:00   ` Cui, Lili
2024-02-29 12:04     ` Jan Beulich
2024-02-29 12:41       ` Cui, Lili
2024-02-29 13:17         ` Jan Beulich
2024-02-29 13:47           ` Cui, Lili
2024-02-29 14:12             ` Jan Beulich
2024-03-01  3:23               ` Cui, Lili
2024-03-01  6:56                 ` Jan Beulich
2024-03-01  8:01                   ` Cui, Lili
2024-03-01 11:36   ` Cui, Lili [this message]
2024-03-01 11:49     ` Jan Beulich
2024-03-01  7:04 ` Jan Beulich
2024-03-01 11:50   ` Cui, Lili
2024-03-19  6:41 Cui, Lili
2024-03-21 14:26 ` Jan Beulich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=PH0PR11MB5593C80CDBC004E5FF0B7CEA9E5E2@PH0PR11MB5593.namprd11.prod.outlook.com \
    --to=lili.cui@intel.com \
    --cc=JBeulich@suse.com \
    --cc=binutils@sourceware.org \
    --cc=hjl.tools@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).