* [PATCH 1/2] Remove i386 cmpstrnsi peephole
@ 2014-09-27 18:11 Andi Kleen
2014-09-27 18:11 ` [PATCH 2/2] Remove x86 cmpstrnsi Andi Kleen
0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2014-09-27 18:11 UTC (permalink / raw)
To: gcc-patches; +Cc: Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
The peephole that removes the code to compute a tristate for cmpstrnsi
when only a boolean jump is needed never triggers in my tests. Just
remove it.
gcc/:
2014-09-27 Andi Kleen <ak@linux.intel.com>
* config/i386/i386.md: Remove peepholes for cmpstrn*.
---
gcc/config/i386/i386.md | 77 -------------------------------------------------
1 file changed, 77 deletions(-)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 004302d..98df8e1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16297,83 +16297,6 @@
(const_string "0")
(const_string "*")))
(set_attr "prefix_rep" "1")])
-
-;; Peephole optimizations to clean up after cmpstrn*. This should be
-;; handled in combine, but it is not currently up to the task.
-;; When used for their truth value, the cmpstrn* expanders generate
-;; code like this:
-;;
-;; repz cmpsb
-;; seta %al
-;; setb %dl
-;; cmpb %al, %dl
-;; jcc label
-;;
-;; The intermediate three instructions are unnecessary.
-
-;; This one handles cmpstrn*_nz_1...
-(define_peephole2
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (compare:CC (mem:BLK (match_operand 4 "register_operand"))
- (mem:BLK (match_operand 5 "register_operand"))))
- (use (match_operand 6 "register_operand"))
- (use (match_operand:SI 3 "immediate_operand"))
- (clobber (match_operand 0 "register_operand"))
- (clobber (match_operand 1 "register_operand"))
- (clobber (match_operand 2 "register_operand"))])
- (set (match_operand:QI 7 "register_operand")
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_operand:QI 8 "register_operand")
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (reg FLAGS_REG)
- (compare (match_dup 7) (match_dup 8)))
- ]
- "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (compare:CC (mem:BLK (match_dup 4))
- (mem:BLK (match_dup 5))))
- (use (match_dup 6))
- (use (match_dup 3))
- (clobber (match_dup 0))
- (clobber (match_dup 1))
- (clobber (match_dup 2))])])
-
-;; ...and this one handles cmpstrn*_1.
-(define_peephole2
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (if_then_else:CC (ne (match_operand 6 "register_operand")
- (const_int 0))
- (compare:CC (mem:BLK (match_operand 4 "register_operand"))
- (mem:BLK (match_operand 5 "register_operand")))
- (const_int 0)))
- (use (match_operand:SI 3 "immediate_operand"))
- (use (reg:CC FLAGS_REG))
- (clobber (match_operand 0 "register_operand"))
- (clobber (match_operand 1 "register_operand"))
- (clobber (match_operand 2 "register_operand"))])
- (set (match_operand:QI 7 "register_operand")
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_operand:QI 8 "register_operand")
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (reg FLAGS_REG)
- (compare (match_dup 7) (match_dup 8)))
- ]
- "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (if_then_else:CC (ne (match_dup 6)
- (const_int 0))
- (compare:CC (mem:BLK (match_dup 4))
- (mem:BLK (match_dup 5)))
- (const_int 0)))
- (use (match_dup 3))
- (use (reg:CC FLAGS_REG))
- (clobber (match_dup 0))
- (clobber (match_dup 1))
- (clobber (match_dup 2))])])
\f
;; Conditional move instructions.
--
2.1.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 2/2] Remove x86 cmpstrnsi
2014-09-27 18:11 [PATCH 1/2] Remove i386 cmpstrnsi peephole Andi Kleen
@ 2014-09-27 18:11 ` Andi Kleen
2014-09-27 18:45 ` Oleg Endo
0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2014-09-27 18:11 UTC (permalink / raw)
To: gcc-patches; +Cc: Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
In my tests the optimized glibc out of line strcmp is always faster than
using inline rep ; cmpsb, even for small strings. The Intel optimization manual
also recommends to not use it. So remove the cmpstrnsi instruction.
Tested on Sandy Bridge, Westmere Intel CPUs.
gcc/:
2014-09-27 Andi Kleen <ak@linux.intel.com>
* config/i386/i386.md (cmpstrnsi, cmpintqi): Remove expanders.
---
gcc/config/i386/i386.md | 85 -------------------------------------------------
1 file changed, 85 deletions(-)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 98df8e1..1d2f1a5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16097,91 +16097,6 @@
(const_string "*")))
(set_attr "mode" "QI")])
-(define_expand "cmpstrnsi"
- [(set (match_operand:SI 0 "register_operand")
- (compare:SI (match_operand:BLK 1 "general_operand")
- (match_operand:BLK 2 "general_operand")))
- (use (match_operand 3 "general_operand"))
- (use (match_operand 4 "immediate_operand"))]
- ""
-{
- rtx addr1, addr2, out, outlow, count, countreg, align;
-
- if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
- FAIL;
-
- /* Can't use this if the user has appropriated ecx, esi or edi. */
- if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
- FAIL;
-
- out = operands[0];
- if (!REG_P (out))
- out = gen_reg_rtx (SImode);
-
- addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
- addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
- if (addr1 != XEXP (operands[1], 0))
- operands[1] = replace_equiv_address_nv (operands[1], addr1);
- if (addr2 != XEXP (operands[2], 0))
- operands[2] = replace_equiv_address_nv (operands[2], addr2);
-
- count = operands[3];
- countreg = ix86_zero_extend_to_Pmode (count);
-
- /* %%% Iff we are testing strict equality, we can use known alignment
- to good advantage. This may be possible with combine, particularly
- once cc0 is dead. */
- align = operands[4];
-
- if (CONST_INT_P (count))
- {
- if (INTVAL (count) == 0)
- {
- emit_move_insn (operands[0], const0_rtx);
- DONE;
- }
- emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
- }
- else
- {
- rtx (*gen_cmp) (rtx, rtx);
-
- gen_cmp = (TARGET_64BIT
- ? gen_cmpdi_1 : gen_cmpsi_1);
-
- emit_insn (gen_cmp (countreg, countreg));
- emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
- }
-
- outlow = gen_lowpart (QImode, out);
- emit_insn (gen_cmpintqi (outlow));
- emit_move_insn (out, gen_rtx_SIGN_EXTEND (SImode, outlow));
-
- if (operands[0] != out)
- emit_move_insn (operands[0], out);
-
- DONE;
-})
-
-;; Produce a tri-state integer (-1, 0, 1) from condition codes.
-
-(define_expand "cmpintqi"
- [(set (match_dup 1)
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_dup 2)
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (parallel [(set (match_operand:QI 0 "register_operand")
- (minus:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC FLAGS_REG))])]
- ""
-{
- operands[1] = gen_reg_rtx (QImode);
- operands[2] = gen_reg_rtx (QImode);
-})
-
;; memcmp recognizers. The `cmpsb' opcode does nothing if the count is
;; zero. Emit extra code to make sure that a zero-length compare is EQ.
--
2.1.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] Remove x86 cmpstrnsi
2014-09-27 18:11 ` [PATCH 2/2] Remove x86 cmpstrnsi Andi Kleen
@ 2014-09-27 18:45 ` Oleg Endo
2014-09-27 18:54 ` Andi Kleen
0 siblings, 1 reply; 5+ messages in thread
From: Oleg Endo @ 2014-09-27 18:45 UTC (permalink / raw)
To: Andi Kleen; +Cc: gcc-patches, Andi Kleen
On Sat, 2014-09-27 at 11:10 -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> In my tests the optimized glibc out of line strcmp is always faster than
> using inline rep ; cmpsb, even for small strings. The Intel optimization manual
> also recommends to not use it. So remove the cmpstrnsi instruction.
>
> Tested on Sandy Bridge, Westmere Intel CPUs.
>
> gcc/:
>
> 2014-09-27 Andi Kleen <ak@linux.intel.com>
>
> * config/i386/i386.md (cmpstrnsi, cmpintqi): Remove expanders.
This has been mentioned a while ago, e.g.
https://gcc.gnu.org/ml/gcc/2002-10/msg01616.html
https://gcc.gnu.org/ml/gcc/2003-04/msg00166.html
Instead of just completely removing it, how about disabling it for newer
CPU types if not optimizing for size?
Cheers,
Oleg
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] Remove x86 cmpstrnsi
2014-09-27 18:45 ` Oleg Endo
@ 2014-09-27 18:54 ` Andi Kleen
0 siblings, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2014-09-27 18:54 UTC (permalink / raw)
To: Oleg Endo; +Cc: Andi Kleen, gcc-patches, Andi Kleen
On Sat, Sep 27, 2014 at 08:45:18PM +0200, Oleg Endo wrote:
> On Sat, 2014-09-27 at 11:10 -0700, Andi Kleen wrote:
> > From: Andi Kleen <ak@linux.intel.com>
> >
> > In my tests the optimized glibc out of line strcmp is always faster than
> > using inline rep ; cmpsb, even for small strings. The Intel optimization manual
> > also recommends to not use it. So remove the cmpstrnsi instruction.
> >
> > Tested on Sandy Bridge, Westmere Intel CPUs.
> >
> > gcc/:
> >
> > 2014-09-27 Andi Kleen <ak@linux.intel.com>
> >
> > * config/i386/i386.md (cmpstrnsi, cmpintqi): Remove expanders.
>
> This has been mentioned a while ago, e.g.
> https://gcc.gnu.org/ml/gcc/2002-10/msg01616.html
> https://gcc.gnu.org/ml/gcc/2003-04/msg00166.html
>
> Instead of just completely removing it, how about disabling it for newer
> CPU types if not optimizing for size?
I believe it was slow even on old CPUs. But back then glibc may have
been even slower. Not sure it is worth keeping it for -Os, especially
given that parts of it have already bitrotted.
-Andi
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 1/2] Remove i386 cmpstrnsi peephole
@ 2014-07-04 5:14 Andi Kleen
2014-07-04 5:14 ` [PATCH 2/2] Remove x86 cmpstrnsi Andi Kleen
0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2014-07-04 5:14 UTC (permalink / raw)
To: gcc-patches; +Cc: Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
The peephole that removes the code to compute a tristate for cmpstrnsi
when only a boolean jump is needed never triggers in my tests. Just
remove it.
gcc/:
2014-07-02 Andi Kleen <ak@linux.intel.com>
* config/i386/i386.md: Remove peepholes for cmpstrn*.
---
gcc/config/i386/i386.md | 77 -------------------------------------------------
1 file changed, 77 deletions(-)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 695b981..5f32a24 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16078,83 +16078,6 @@
(const_string "0")
(const_string "*")))
(set_attr "prefix_rep" "1")])
-
-;; Peephole optimizations to clean up after cmpstrn*. This should be
-;; handled in combine, but it is not currently up to the task.
-;; When used for their truth value, the cmpstrn* expanders generate
-;; code like this:
-;;
-;; repz cmpsb
-;; seta %al
-;; setb %dl
-;; cmpb %al, %dl
-;; jcc label
-;;
-;; The intermediate three instructions are unnecessary.
-
-;; This one handles cmpstrn*_nz_1...
-(define_peephole2
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (compare:CC (mem:BLK (match_operand 4 "register_operand"))
- (mem:BLK (match_operand 5 "register_operand"))))
- (use (match_operand 6 "register_operand"))
- (use (match_operand:SI 3 "immediate_operand"))
- (clobber (match_operand 0 "register_operand"))
- (clobber (match_operand 1 "register_operand"))
- (clobber (match_operand 2 "register_operand"))])
- (set (match_operand:QI 7 "register_operand")
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_operand:QI 8 "register_operand")
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (reg FLAGS_REG)
- (compare (match_dup 7) (match_dup 8)))
- ]
- "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (compare:CC (mem:BLK (match_dup 4))
- (mem:BLK (match_dup 5))))
- (use (match_dup 6))
- (use (match_dup 3))
- (clobber (match_dup 0))
- (clobber (match_dup 1))
- (clobber (match_dup 2))])])
-
-;; ...and this one handles cmpstrn*_1.
-(define_peephole2
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (if_then_else:CC (ne (match_operand 6 "register_operand")
- (const_int 0))
- (compare:CC (mem:BLK (match_operand 4 "register_operand"))
- (mem:BLK (match_operand 5 "register_operand")))
- (const_int 0)))
- (use (match_operand:SI 3 "immediate_operand"))
- (use (reg:CC FLAGS_REG))
- (clobber (match_operand 0 "register_operand"))
- (clobber (match_operand 1 "register_operand"))
- (clobber (match_operand 2 "register_operand"))])
- (set (match_operand:QI 7 "register_operand")
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_operand:QI 8 "register_operand")
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (reg FLAGS_REG)
- (compare (match_dup 7) (match_dup 8)))
- ]
- "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
- [(parallel[
- (set (reg:CC FLAGS_REG)
- (if_then_else:CC (ne (match_dup 6)
- (const_int 0))
- (compare:CC (mem:BLK (match_dup 4))
- (mem:BLK (match_dup 5)))
- (const_int 0)))
- (use (match_dup 3))
- (use (reg:CC FLAGS_REG))
- (clobber (match_dup 0))
- (clobber (match_dup 1))
- (clobber (match_dup 2))])])
\f
;; Conditional move instructions.
--
2.0.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 2/2] Remove x86 cmpstrnsi
2014-07-04 5:14 [PATCH 1/2] Remove i386 cmpstrnsi peephole Andi Kleen
@ 2014-07-04 5:14 ` Andi Kleen
0 siblings, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2014-07-04 5:14 UTC (permalink / raw)
To: gcc-patches; +Cc: Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
In my tests the optimized glibc out of line strcmp is always faster than
using inline rep ; cmpsb, even for small strings. The Intel optimization manual
also recommends to not use it. So remove the cmpstrnsi instruction.
Tested on Sandy Bridge, Westmere Intel CPUs.
gcc/:
2014-07-02 Andi Kleen <ak@linux.intel.com>
* config/i386/i386.md (cmpstrnsi, cmpintqi): Remove expanders.
---
gcc/config/i386/i386.md | 85 -------------------------------------------------
1 file changed, 85 deletions(-)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5f32a24..67f1343 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15878,91 +15878,6 @@
(const_string "*")))
(set_attr "mode" "QI")])
-(define_expand "cmpstrnsi"
- [(set (match_operand:SI 0 "register_operand")
- (compare:SI (match_operand:BLK 1 "general_operand")
- (match_operand:BLK 2 "general_operand")))
- (use (match_operand 3 "general_operand"))
- (use (match_operand 4 "immediate_operand"))]
- ""
-{
- rtx addr1, addr2, out, outlow, count, countreg, align;
-
- if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
- FAIL;
-
- /* Can't use this if the user has appropriated ecx, esi or edi. */
- if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
- FAIL;
-
- out = operands[0];
- if (!REG_P (out))
- out = gen_reg_rtx (SImode);
-
- addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
- addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
- if (addr1 != XEXP (operands[1], 0))
- operands[1] = replace_equiv_address_nv (operands[1], addr1);
- if (addr2 != XEXP (operands[2], 0))
- operands[2] = replace_equiv_address_nv (operands[2], addr2);
-
- count = operands[3];
- countreg = ix86_zero_extend_to_Pmode (count);
-
- /* %%% Iff we are testing strict equality, we can use known alignment
- to good advantage. This may be possible with combine, particularly
- once cc0 is dead. */
- align = operands[4];
-
- if (CONST_INT_P (count))
- {
- if (INTVAL (count) == 0)
- {
- emit_move_insn (operands[0], const0_rtx);
- DONE;
- }
- emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
- }
- else
- {
- rtx (*gen_cmp) (rtx, rtx);
-
- gen_cmp = (TARGET_64BIT
- ? gen_cmpdi_1 : gen_cmpsi_1);
-
- emit_insn (gen_cmp (countreg, countreg));
- emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
- operands[1], operands[2]));
- }
-
- outlow = gen_lowpart (QImode, out);
- emit_insn (gen_cmpintqi (outlow));
- emit_move_insn (out, gen_rtx_SIGN_EXTEND (SImode, outlow));
-
- if (operands[0] != out)
- emit_move_insn (operands[0], out);
-
- DONE;
-})
-
-;; Produce a tri-state integer (-1, 0, 1) from condition codes.
-
-(define_expand "cmpintqi"
- [(set (match_dup 1)
- (gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (set (match_dup 2)
- (ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
- (parallel [(set (match_operand:QI 0 "register_operand")
- (minus:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC FLAGS_REG))])]
- ""
-{
- operands[1] = gen_reg_rtx (QImode);
- operands[2] = gen_reg_rtx (QImode);
-})
-
;; memcmp recognizers. The `cmpsb' opcode does nothing if the count is
;; zero. Emit extra code to make sure that a zero-length compare is EQ.
--
2.0.0
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2014-09-27 18:54 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-27 18:11 [PATCH 1/2] Remove i386 cmpstrnsi peephole Andi Kleen
2014-09-27 18:11 ` [PATCH 2/2] Remove x86 cmpstrnsi Andi Kleen
2014-09-27 18:45 ` Oleg Endo
2014-09-27 18:54 ` Andi Kleen
-- strict thread matches above, loose matches on Subject: below --
2014-07-04 5:14 [PATCH 1/2] Remove i386 cmpstrnsi peephole Andi Kleen
2014-07-04 5:14 ` [PATCH 2/2] Remove x86 cmpstrnsi Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).