From: "Cui, Lili" <lili.cui@intel.com>
To: "Beulich, Jan" <JBeulich@suse.com>
Cc: "H.J. Lu" <hjl.tools@gmail.com>,
"binutils@sourceware.org" <binutils@sourceware.org>
Subject: RE: [PATCH] Support APX NF
Date: Fri, 1 Mar 2024 11:36:41 +0000 [thread overview]
Message-ID: <PH0PR11MB5593C80CDBC004E5FF0B7CEA9E5E2@PH0PR11MB5593.namprd11.prod.outlook.com> (raw)
In-Reply-To: <e908fddd-6fd2-4880-bcd7-7c49d245db18@suse.com>
> > --- /dev/null
> > +++ b/gas/testsuite/gas/i386/x86-64-apx-nf.s
> >[...]
> > + {nf} ror %cl, 291(%r8, %rax, 4), %r9
> > + {nf} sar $1, %bl
> > + {nf} sar $1, %bl, %dl
> > + {nf} sar $1, %dx
> > + {nf} sar $1, %dx, %ax
> > + {nf} sar $1, %ecx
> > + {nf} sar $1, %ecx, %edx
> > + {nf} sar $1, %r9
> > + {nf} sar $1, %r9, %r31
> > + {nf} sarb $1, 291(%r8, %rax, 4)
> > + {nf} sar $1, 291(%r8, %rax, 4), %bl
> > + {nf} sarw $1, 291(%r8, %rax, 4)
> > + {nf} sar $1, 291(%r8, %rax, 4), %dx
> > + {nf} sarl $1, 291(%r8, %rax, 4)
> > + {nf} sar $1, 291(%r8, %rax, 4), %ecx
> > + {nf} sarq $1, 291(%r8, %rax, 4)
> > + {nf} sar $1, 291(%r8, %rax, 4), %r9
> > + {nf} sar $123, %bl
> > + {nf} sar $123, %bl, %dl
> > + {nf} sar $123, %dx
> > + {nf} sar $123, %dx, %ax
> > + {nf} sar $123, %ecx
> > + {nf} sar $123, %ecx, %edx
> > + {nf} sar $123, %r9
> > + {nf} sar $123, %r9, %r31
> > + {nf} sarb $123, 291(%r8, %rax, 4)
> > + {nf} sar $123, 291(%r8, %rax, 4), %bl
> > + {nf} sarw $123, 291(%r8, %rax, 4)
> > + {nf} sar $123, 291(%r8, %rax, 4), %dx
> > + {nf} sarl $123, 291(%r8, %rax, 4)
> > + {nf} sar $123, 291(%r8, %rax, 4), %ecx
> > + {nf} sarq $123, 291(%r8, %rax, 4)
> > + {nf} sar $123, 291(%r8, %rax, 4), %r9
> > + {nf} sar %cl, %bl
> > + {nf} sar %cl, %bl, %dl
> > + {nf} sar %cl, %dx
> > + {nf} sar %cl, %dx, %ax
> > + {nf} sar %cl, %ecx
> > + {nf} sar %cl, %ecx, %edx
> > + {nf} sar %cl, %r9
> > + {nf} sar %cl, %r9, %r31
> > + {nf} sarb %cl, 291(%r8, %rax, 4)
> > + {nf} sar %cl, 291(%r8, %rax, 4), %bl
> > + {nf} sarw %cl, 291(%r8, %rax, 4)
> > + {nf} sar %cl, 291(%r8, %rax, 4), %dx
> > + {nf} sarl %cl, 291(%r8, %rax, 4)
> > + {nf} sar %cl, 291(%r8, %rax, 4), %ecx
> > + {nf} sarq %cl, 291(%r8, %rax, 4)
> > + {nf} sar %cl, 291(%r8, %rax, 4), %r9
> > + {nf} shl $1, %bl
> > + {nf} shl $1, %bl, %dl
> > + {nf} shl $1, %dx
> > + {nf} shl $1, %dx, %ax
> > + {nf} shl $1, %ecx
> > + {nf} shl $1, %ecx, %edx
> > + {nf} shl $1, %r9
> > + {nf} shl $1, %r9, %r31
> > + {nf} shlb $1, 291(%r8, %rax, 4)
> > + {nf} shl $1, 291(%r8, %rax, 4), %bl
> > + {nf} shlw $1, 291(%r8, %rax, 4)
> > + {nf} shl $1, 291(%r8, %rax, 4), %dx
> > + {nf} shll $1, 291(%r8, %rax, 4)
> > + {nf} shl $1, 291(%r8, %rax, 4), %ecx
> > + {nf} shlq $1, 291(%r8, %rax, 4)
> > + {nf} shl $1, 291(%r8, %rax, 4), %r9
> > + {nf} shl $123, %bl
> > + {nf} shl $123, %bl, %dl
> > + {nf} shl $123, %dx
> > + {nf} shl $123, %dx, %ax
> > + {nf} shl $123, %ecx
> > + {nf} shl $123, %ecx, %edx
> > + {nf} shl $123, %r9
> > + {nf} shl $123, %r9, %r31
> > + {nf} shlb $123, 291(%r8, %rax, 4)
> > + {nf} shl $123, 291(%r8, %rax, 4), %bl
> > + {nf} shlw $123, 291(%r8, %rax, 4)
> > + {nf} shl $123, 291(%r8, %rax, 4), %dx
> > + {nf} shll $123, 291(%r8, %rax, 4)
> > + {nf} shl $123, 291(%r8, %rax, 4), %ecx
> > + {nf} shlq $123, 291(%r8, %rax, 4)
> > + {nf} shl $123, 291(%r8, %rax, 4), %r9
> > + {nf} shl %cl, %bl
> > + {nf} shl %cl, %bl, %dl
> > + {nf} shl %cl, %dx
> > + {nf} shl %cl, %dx, %ax
> > + {nf} shl %cl, %ecx
> > + {nf} shl %cl, %ecx, %edx
> > + {nf} shl %cl, %r9
> > + {nf} shl %cl, %r9, %r31
> > + {nf} shlb %cl, 291(%r8, %rax, 4)
> > + {nf} shl %cl, 291(%r8, %rax, 4), %bl
> > + {nf} shlw %cl, 291(%r8, %rax, 4)
> > + {nf} shl %cl, 291(%r8, %rax, 4), %dx
> > + {nf} shll %cl, 291(%r8, %rax, 4)
> > + {nf} shl %cl, 291(%r8, %rax, 4), %ecx
> > + {nf} shlq %cl, 291(%r8, %rax, 4)
> > + {nf} shl %cl, 291(%r8, %rax, 4), %r9
> > + {nf} sal $1, %bl
> > + {nf} sal $1, %bl, %dl
> > + {nf} sal $1, %dx
> > + {nf} sal $1, %dx, %ax
> > + {nf} sal $1, %ecx
> > + {nf} sal $1, %ecx, %edx
> > + {nf} sal $1, %r9
> > + {nf} sal $1, %r9, %r31
> > + {nf} salb $1, 291(%r8, %rax, 4)
> > + {nf} sal $1, 291(%r8, %rax, 4), %bl
> > + {nf} salw $1, 291(%r8, %rax, 4)
> > + {nf} sal $1, 291(%r8, %rax, 4), %dx
> > + {nf} sall $1, 291(%r8, %rax, 4)
> > + {nf} sal $1, 291(%r8, %rax, 4), %ecx
> > + {nf} salq $1, 291(%r8, %rax, 4)
> > + {nf} sal $1, 291(%r8, %rax, 4), %r9
> > + {nf} sal $123, %bl
> > + {nf} sal $123, %bl, %dl
> > + {nf} sal $123, %dx
> > + {nf} sal $123, %dx, %ax
> > + {nf} sal $123, %ecx
> > + {nf} sal $123, %ecx, %edx
> > + {nf} sal $123, %r9
> > + {nf} sal $123, %r9, %r31
> > + {nf} salb $123, 291(%r8, %rax, 4)
> > + {nf} sal $123, 291(%r8, %rax, 4), %bl
> > + {nf} salw $123, 291(%r8, %rax, 4)
> > + {nf} sal $123, 291(%r8, %rax, 4), %dx
> > + {nf} sall $123, 291(%r8, %rax, 4)
> > + {nf} sal $123, 291(%r8, %rax, 4), %ecx
> > + {nf} salq $123, 291(%r8, %rax, 4)
> > + {nf} sal $123, 291(%r8, %rax, 4), %r9
> > + {nf} sal %cl, %bl
> > + {nf} sal %cl, %bl, %dl
> > + {nf} sal %cl, %dx
> > + {nf} sal %cl, %dx, %ax
> > + {nf} sal %cl, %ecx
> > + {nf} sal %cl, %ecx, %edx
> > + {nf} sal %cl, %r9
> > + {nf} sal %cl, %r9, %r31
> > + {nf} salb %cl, 291(%r8, %rax, 4)
> > + {nf} sal %cl, 291(%r8, %rax, 4), %bl
> > + {nf} salw %cl, 291(%r8, %rax, 4)
> > + {nf} sal %cl, 291(%r8, %rax, 4), %dx
> > + {nf} sall %cl, 291(%r8, %rax, 4)
> > + {nf} sal %cl, 291(%r8, %rax, 4), %ecx
> > + {nf} salq %cl, 291(%r8, %rax, 4)
> > + {nf} sal %cl, 291(%r8, %rax, 4), %r9
>
> Hmm, I think sorting in the source file is more relevant than in the output
> (expectations), so I think this SAL block wants moving up. Would of course be
> yet more natural if we actually encoded SAL with ModR/M.reg=6 rather than
> the same encoding as SHL ...
>
Done.
> > --- a/opcodes/i386-dis-evex-reg.h
> > +++ b/opcodes/i386-dis-evex-reg.h
> > @@ -51,33 +51,33 @@
> > },
> > /* REG_EVEX_MAP4_80 */
> > {
> > - { "addA", { VexGb, Eb, Ib }, NO_PREFIX },
> > - { "orA", { VexGb, Eb, Ib }, NO_PREFIX },
> > + { "%XNaddA", { VexGb, Eb, Ib }, NO_PREFIX },
> > + { "%XNorA", { VexGb, Eb, Ib }, NO_PREFIX },
>
> Since there are quite a number of entries which are affected (and more to
> come), did you consider using a single-character macro here? I realize the
> three we presently have free don't fit overly well letter-wise, but it ought to be
> possible to e.g. free up F (rarely used, could become a two-letter one) for use
> here.
>
> Seeing that you need to fiddle with the "case 'N'" code anyway, did you
> further consider giving 'N' a second purpose? Present and projected uses are
> easy to tell apart by being non-EVEX / EVEX respectively.
>
> If we really wanted to stick to a two-letter one, I think it would further want
> considering to use %NF, such that its purpose is immediately clear from the
> letters used.
>
Changed to %NF.
> > @@ -9147,6 +9150,10 @@ get_valid_dis386 (const struct dis386 *dp,
> instr_info *ins)
> > ins->vex.v = *ins->codep & 0x8;
> > ins->vex.mask_register_specifier = *ins->codep & 0x7;
> > ins->vex.zeroing = *ins->codep & 0x80;
> > + /* Set the NF bit for the EVEX instruction extended from the legacy or
> > + vex instruction, this bit will be cleared when it can be confirmed
> > + that its default type is evex. */
> > + ins->vex.nf = *ins->codep & 0x4;
> >
> > if (ins->address_mode != mode_64bit)
> > {
> > @@ -9600,6 +9607,15 @@ print_insn (bfd_vma pc, disassemble_info *info,
> int intel_syntax)
> > && ins.vex.prefix == DATA_PREFIX_OPCODE)
> > sizeflag ^= DFLAG;
> >
> > + if(ins.evex_type == evex_default)
> > + ins.vex.nf = false;
>
> Up to here I think I agree.
>
> > + else
> > + /* For EVEX-promoted formats, we need to clear EVEX.NF (For ccmp
> and
> > + ctest, they will be cleared separately.) in mask_register_specifier
> > + and keep the low 2 bits of mask_register_specifier to report errors
> > + for invalid cases.*/
> > + ins.vex.mask_register_specifier &= 0x3;
>
> But this I'm in trouble with: How would you recognize (and accordingly print)
> insns with NF wrongly set? (By implication there's also a respective testcase
> [addition] missing.)
>
For the else branch here, "ins.vex.mask_register_specifier" and "ins.vex.nf" store the same value, so I just clear the redundant one. Otherwise, the NF bit in "ins.vex.mask_register_specifier" would be printed out as a k register later. But we still need the low 2 bits to know the original value; we will use them in subsequent patches.
Jan, is it necessary to report "(bad)" for NF? We would need to set a new flag in putop() for instructions that have %NF; then, after putop(), we could tell whether the NF bit is set incorrectly, which is a bit ugly. Do you have a better way?
> > --- a/opcodes/i386-opc.h
> > +++ b/opcodes/i386-opc.h
> > @@ -1017,7 +1017,8 @@ typedef struct insn_template
> > #define Prefix_EVEX 7 /* {evex} */
> > #define Prefix_REX 8 /* {rex} */
> > #define Prefix_REX2 9 /* {rex2} */
> > -#define Prefix_NoOptimize 10 /* {nooptimize} */
> > +#define Prefix_NF 10 /* {nf} */
> > +#define Prefix_NoOptimize 11 /* {nooptimize} */
>
> I find it increasingly puzzling that nooptimize is pushed all further down, for no
> real reason.
Ah yes, they are all pseudo prefixes; it is convenient to always put the new one at the end.
>
> > --- a/opcodes/i386-opc.tbl
> > +++ b/opcodes/i386-opc.tbl
> > @@ -310,32 +310,42 @@ sti, 0xfb, 0, NoSuf, {} // Arithmetic.
> > add, 0x0, APX_F,
> > D|C|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64 } add, 0x0, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x0, APX_F,
> D|W|CheckOperandSize|Modrm|No_sSuf|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
> > add, 0x83/0, APX_F,
> > Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|NF,
> { Imm8S,
> > Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 } add,
> > 0x83/0, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock, { Imm8S,
> > Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x83/0, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|NF, { Imm8S,
> > +Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > add, 0x4, 0, W|No_sSuf, { Imm8|Imm16|Imm32|Imm32S,
> > Acc|Byte|Word|Dword|Qword } add, 0x80/0, APX_F,
> > W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Imm8|Imm16|Imm32|Imm32S,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64} add, 0x80/0, 0,
> > W|Modrm|No_sSuf|HLEPrefixLock, { Imm8|Imm16|Imm32|Imm32S,
> > Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +add, 0x80/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, {
> > +Imm8|Imm16|Imm32|Imm32S,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
>
> Adding these templates has a 2nd effect, for which no testcase is being added:
> They now allow (taking the example here) "{evex} add $1, %eax". Such a new
> test (which could be less extensive than the -nf one you already add) should
> then also cover ADCX and ADOX, for which the 2-operand EVEX templates
> were added prematurely.
>
We added the test cases you mentioned in a separate patch that added all {evex} test cases for the APX instructions. I'll send it out later.
> > inc, 0x40, No64, No_bSuf|No_sSuf|No_qSuf, { Reg16|Reg32 } inc,
> > 0xfe/0, APX_F,
> W|Modrm|No_sSuf|CheckOperandSize|DstVVVV|EVexMap4|NF,
> > {Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> Reg8|Reg16|Reg32|Reg64}
> > inc, 0xfe/0, 0, W|Modrm|No_sSuf|HLEPrefixLock, {
> > Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +inc, 0xfe/0, APX_F, W|Modrm|No_sSuf|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> >
> > sub, 0x28, APX_F,
> > D|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|NF, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex,
> > Reg8|Reg16|Reg32|Reg64, } sub, 0x28, 0,
> > D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|Optimize, {
> > Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
> > +sub, 0x28, APX_F,
> > +D|W|CheckOperandSize|Modrm|No_sSuf|Optimize|EVexMap4|NF, {
> > +Reg8|Reg16|Reg32|Reg64,
> Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex
> > +}
>
> What's the purpose of Optimize here? Just to repeat my earlier request:
> Please don't blindly copy all attributes when you clone templates. See how the
> existing APX template already doesn't have this attribute.
> Apparently you re-cloned the legacy one, not the APX one.
>
I noticed the Optimize here before sending the patch. I didn't remove it because I felt some optimizations should be extended to {evex}, but now I realize I was wrong: the optimization of sub is meant to reduce the binary size, and it does not work with {evex}.
subq %r64, %r64 -> subl %r32, %r32
I will drop Optimize for NF.
> Specifically here, this template will only be chosen if either {nf} or {evex} is
> present. Both of which preclude the NDD->REX2 transformation, in turn
> making further optimization impossible.
>
> As to {nf} precluding optimization: can_convert_NDD_to_legacy() checks
> i.tm.opcode_modifier.nf rather than i.has_nf. That's entirely dead code, as i.tm
> is populated only by install_template(). This check wants dropping in a prereq
> patch, I suppose, and then the patch here should add the correct check. I recall
> saying back then that a respective check needs adding here, not already in the
> patch introducing the transformation.
>
> Jan
Wow, great finding, changed.
Thanks,
Lili.
next prev parent reply other threads:[~2024-03-01 11:37 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-27 9:01 Cui, Lili
2024-02-28 16:11 ` H.J. Lu
2024-02-29 1:12 ` Cui, Lili
2024-02-29 6:53 ` Jan Beulich
2024-02-29 8:39 ` Cui, Lili
2024-02-29 9:06 ` Jan Beulich
2024-02-29 10:22 ` Cui, Lili
2024-02-29 12:23 ` H.J. Lu
2024-02-29 12:26 ` Cui, Lili
2024-02-29 11:21 ` Jan Beulich
2024-02-29 12:00 ` Cui, Lili
2024-02-29 12:04 ` Jan Beulich
2024-02-29 12:41 ` Cui, Lili
2024-02-29 13:17 ` Jan Beulich
2024-02-29 13:47 ` Cui, Lili
2024-02-29 14:12 ` Jan Beulich
2024-03-01 3:23 ` Cui, Lili
2024-03-01 6:56 ` Jan Beulich
2024-03-01 8:01 ` Cui, Lili
2024-03-01 11:36 ` Cui, Lili [this message]
2024-03-01 11:49 ` Jan Beulich
2024-03-01 7:04 ` Jan Beulich
2024-03-01 11:50 ` Cui, Lili
2024-03-19 6:41 Cui, Lili
2024-03-21 14:26 ` Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=PH0PR11MB5593C80CDBC004E5FF0B7CEA9E5E2@PH0PR11MB5593.namprd11.prod.outlook.com \
--to=lili.cui@intel.com \
--cc=JBeulich@suse.com \
--cc=binutils@sourceware.org \
--cc=hjl.tools@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).