On 23/11/15 15:01, Kyrill Tkachov wrote:
>
> On 23/11/15 14:58, James Greenhalgh wrote:
>> On Mon, Nov 23, 2015 at 10:33:01AM +0000, Kyrill Tkachov wrote:
>>> On 12/11/15 12:05, James Greenhalgh wrote:
>>>> On Tue, Nov 03, 2015 at 03:43:24PM +0000, Kyrill Tkachov wrote:
>>>>> Hi all,
>>>>>
>>>>> Bootstrapped and tested on aarch64.
>>>>>
>>>>> Ok for trunk?
>>>> Comments in-line.
>>>>
>>> Here's an updated patch according to your comments.
>>> Sorry it took so long to respin it; I had other things to deal with
>>> as stage1 was closing...
>>>
>>> I've indented the sample code sequences and used valid mnemonics.
>>> These patterns can only match during combine, so I'd expect them to
>>> always split during combine or immediately after, but I don't think
>>> that's a documented guarantee, so I've gated them on !reload_completed.
>>>
>>> I've used IN_RANGE in the predicates.md hunk and added scan-assembler
>>> checks to the tests.
>>>
>>> Is this ok?
>>>
>>> Thanks,
>>> Kyrill
>>>
>>> 2015-11-20  Kyrylo Tkachov
>>>
>>>     * config/aarch64/aarch64.md (*condjump): Rename to...
>>>     (condjump): ... This.
>>>     (*compare_condjump<mode>): New define_insn_and_split.
>>>     (*compare_cstore<mode>_insn): Likewise.
>>>     (*cstore<mode>_insn): Rename to...
>>>     (aarch64_cstore<mode>): ... This.
>>>     * config/aarch64/iterators.md (CMP): Handle ne code.
>>>     * config/aarch64/predicates.md (aarch64_imm24): New predicate.
>>>
>>> 2015-11-20  Kyrylo Tkachov
>>>
>>>     * gcc.target/aarch64/cmpimm_branch_1.c: New test.
>>>     * gcc.target/aarch64/cmpimm_cset_1.c: Likewise.
>>>
>>> commit bb44feed4e6beaae25d9bdffa45073dc61c65838
>>> Author: Kyrylo Tkachov
>>> Date:   Mon Sep 21 10:56:47 2015 +0100
>>>
>>>     [AArch64] Improve comparison with complex immediates
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>>> index 11f6387..3e57d08 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -372,7 +372,7 @@ (define_expand "mod<mode>3"
>>>    }
>>>  )
>>>
>>> -(define_insn "*condjump"
>>> +(define_insn "condjump"
>>>    [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>>>                              [(match_operand 1 "cc_register" "") (const_int 0)])
>>>                           (label_ref (match_operand 2 "" ""))
>>> @@ -397,6 +397,41 @@ (define_insn "*condjump"
>>>                      (const_int 1)))]
>>>  )
>>>
>>> +;; For a 24-bit immediate CST we can optimize the compare for equality
>>> +;; and branch sequence from:
>>> +;;     mov     x0, #imm1
>>> +;;     movk    x0, #imm2, lsl 16 /* x0 contains CST.  */
>>> +;;     cmp     x1, x0
>>> +;;     b<ne,eq> .Label
>>> +;; into the shorter:
>>> +;;     sub     x0, x0, #(CST & 0xfff000)
>>> +;;     subs    x0, x0, #(CST & 0x000fff)
>> sub     x0, x1, #(CST....) ?
>>
>> The transform doesn't make sense otherwise.
>
> Doh, yes. The source should be x1 of course.
> Here's what I'll be committing.

Thanks,
Kyrill

2015-11-24  Kyrylo Tkachov

    * config/aarch64/aarch64.md (*condjump): Rename to...
    (condjump): ... This.
    (*compare_condjump<mode>): New define_insn_and_split.
    (*compare_cstore<mode>_insn): Likewise.
    (*cstore<mode>_insn): Rename to...
    (aarch64_cstore<mode>): ... This.
    * config/aarch64/iterators.md (CMP): Handle ne code.
    * config/aarch64/predicates.md (aarch64_imm24): New predicate.

2015-11-24  Kyrylo Tkachov

    * gcc.target/aarch64/cmpimm_branch_1.c: New test.
    * gcc.target/aarch64/cmpimm_cset_1.c: Likewise.
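For reference, the split leans on a simple arithmetic identity: for any
24-bit CST, the masked pieces hi = CST & 0xfff000 and lo = CST & 0xfff
sum back to CST, so x - hi - lo == 0 exactly when x == CST. A standalone
C sketch of that identity, for illustration only and not part of the
patch:

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    const uint64_t cst = 0x123456;      /* Any 24-bit immediate.  */
    const uint64_t hi = cst & 0xfff000; /* sub  x0, x1, #hi  */
    const uint64_t lo = cst & 0x000fff; /* subs x0, x0, #lo  */
    const uint64_t x[] = { 0, 1, cst - 1, cst, cst + 1, 0xffffff, ~0ull };

    /* The two masked pieces reassemble the constant...  */
    assert (hi + lo == cst);
    /* ...so subtracting them in sequence compares against CST.  */
    for (unsigned int i = 0; i < sizeof x / sizeof x[0]; i++)
      assert ((x[i] - hi - lo == 0) == (x[i] == cst));
    return 0;
  }

The subtractions wrap modulo 2^64, so the equivalence holds for every
64-bit x, not just small values.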
> Kyrill
>
>>
>>> +;;     b<ne,eq> .Label
>>> +(define_insn_and_split "*compare_condjump<mode>"
>>> +  [(set (pc) (if_then_else (EQL
>>> +                             (match_operand:GPI 0 "register_operand" "r")
>>> +                             (match_operand:GPI 1 "aarch64_imm24" "n"))
>>> +                          (label_ref:P (match_operand 2 "" ""))
>>> +                          (pc)))]
>>> +  "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode)
>>> +   && !aarch64_plus_operand (operands[1], <MODE>mode)
>>> +   && !reload_completed"
>>> +  "#"
>>> +  "&& true"
>>> +  [(const_int 0)]
>>> +  {
>>> +    HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
>>> +    HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
>>> +    rtx tmp = gen_reg_rtx (<MODE>mode);
>>> +    emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm)));
>>> +    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
>>> +    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>>> +    rtx cmp_rtx = gen_rtx_fmt_ee (<CMP>, <MODE>mode, cc_reg, const0_rtx);
>>> +    emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
>>> +    DONE;
>>> +  }
>>> +)
>>> +
>>>  (define_expand "casesi"
>>>    [(match_operand:SI 0 "register_operand" "")  ; Index
>>>     (match_operand:SI 1 "const_int_operand" "") ; Lower bound
>>> @@ -2901,7 +2936,7 @@ (define_expand "cstore<mode>4"
>>>    "
>>>  )
>>>
>>> -(define_insn "*cstore<mode>_insn"
>>> +(define_insn "aarch64_cstore<mode>"
>>>    [(set (match_operand:ALLI 0 "register_operand" "=r")
>>>         (match_operator:ALLI 1 "aarch64_comparison_operator"
>>>          [(match_operand 2 "cc_register" "") (const_int 0)]))]
>>> @@ -2910,6 +2945,40 @@ (define_insn "*cstore<mode>_insn"
>>>    [(set_attr "type" "csel")]
>>>  )
>>>
>>> +;; For a 24-bit immediate CST we can optimize the compare for equality
>>> +;; and branch sequence from:
>>> +;;     mov     x0, #imm1
>>> +;;     movk    x0, #imm2, lsl 16 /* x0 contains CST.  */
>>> +;;     cmp     x1, x0
>>> +;;     cset    x2, <ne,eq>
>>> +;; into the shorter:
>>> +;;     sub     x0, x0, #(CST & 0xfff000)
>>> +;;     subs    x0, x0, #(CST & 0x000fff)
>>> +;;     cset    x1, <ne,eq>.
>> Please fix the register allocation in your shorter sequence, these
>> are not equivalent.
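A note on the splitter body quoted above: any constant accepted by
aarch64_imm24 decomposes into exactly the two immediate forms that
AArch64 add/sub instructions encode, a plain 12-bit immediate (lo_imm)
and a 12-bit immediate shifted left by 12 (hi_imm), and the negated
values -hi_imm and -lo_imm are representable as sub immediates of the
same shape. The second addition goes through the flag-setting
add<mode>3_compare0 pattern, so the final NZ flags are those of x - CST
compared against zero. A hypothetical C helper sketching the
decomposition, not GCC code:

  #include <stdint.h>

  struct split24 { uint64_t hi, lo; };

  /* Split a 24-bit constant into the two add/sub immediate classes:
     hi >> 12 fits the 12-bit "LSL #12" field and lo fits the plain
     12-bit field, so hi + lo reassembles the original constant.
     Hypothetical illustration only.  */
  static struct split24
  split_imm24 (uint64_t cst)
  {
    struct split24 s = { cst & 0xfff000, cst & 0xfff };
    return s;
  }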
>>
>>> +(define_insn_and_split "*compare_cstore<mode>_insn"
>>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>>> +        (EQL:GPI (match_operand:GPI 1 "register_operand" "r")
>>> +                 (match_operand:GPI 2 "aarch64_imm24" "n")))]
>>> +  "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
>>> +   && !aarch64_plus_operand (operands[2], <MODE>mode)
>>> +   && !reload_completed"
>>> +  "#"
>>> +  "&& true"
>>> +  [(const_int 0)]
>>> +  {
>>> +    HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff;
>>> +    HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000;
>>> +    rtx tmp = gen_reg_rtx (<MODE>mode);
>>> +    emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm)));
>>> +    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
>>> +    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>>> +    rtx cmp_rtx = gen_rtx_fmt_ee (<CMP>, <MODE>mode, cc_reg, const0_rtx);
>>> +    emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg));
>>> +    DONE;
>>> +  }
>>> +  [(set_attr "type" "csel")]
>>> +)
>>> +
>>>  ;; zero_extend version of the above
>>>  (define_insn "*cstoresi_insn_uxtw"
>>>    [(set (match_operand:DI 0 "register_operand" "=r")
>>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>>> index c2eb7de..422bc87 100644
>>> --- a/gcc/config/aarch64/iterators.md
>>> +++ b/gcc/config/aarch64/iterators.md
>>> @@ -824,7 +824,8 @@ (define_code_attr cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2")
>>>                          (ltu "1") (leu "1") (geu "2") (gtu "2")])
>>>
>>>  (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT")
>>> -                      (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")])
>>> +                      (ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU")
>>> +                      (gtu "GTU")])
>>>
>>>  (define_code_attr fix_trunc_optab [(fix "fix_trunc")
>>>                                    (unsigned_fix "fixuns_trunc")])
>>> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>>> index e7f76e0..c0c3ff5 100644
>>> --- a/gcc/config/aarch64/predicates.md
>>> +++ b/gcc/config/aarch64/predicates.md
>>> @@ -145,6 +145,11 @@ (define_predicate "aarch64_imm3"
>>>    (and (match_code "const_int")
>>>         (match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4")))
>>>
>>> +;; An immediate that fits into 24 bits.
>>> +(define_predicate "aarch64_imm24"
>>> +  (and (match_code "const_int")
>>> +       (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)")))
>>> +
>>>  (define_predicate "aarch64_pwr_imm3"
>>>    (and (match_code "const_int")
>>>         (match_test "INTVAL (op) != 0
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
>>> new file mode 100644
>>> index 0000000..7ad736b
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c
>>> @@ -0,0 +1,26 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-save-temps -O2" } */
>>> +
>>> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
>>> +
>>> +void g (void);
>>> +void
>>> +foo (int x)
>>> +{
>>> +  if (x != 0x123456)
>>> +    g ();
>>> +}
>>> +
>>> +void
>>> +fool (long long x)
>>> +{
>>> +  if (x != 0x123456)
>>> +    g ();
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */
>>> +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */
>>> +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */
>>> +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
>>> new file mode 100644
>>> index 0000000..6a03cc9
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c
>>> @@ -0,0 +1,23 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-save-temps -O2" } */
>>> +
>>> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp.  */
>>> +
>>> +int
>>> +foo (int x)
>>> +{
>>> +  return x == 0x123456;
>>> +}
>>> +
>>> +long
>>> +fool (long x)
>>> +{
>>> +  return x == 0x123456;
>>> +}
>>> +
>> This test will be broken for ILP32. This should be long long.
>>
>> OK with those comments fixed.
>
> Thanks, I'll prepare an updated version.
>
> Kyrill
>
>> Thanks,
>> James
>>
>
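A closing note on the ILP32 remark: under the AArch64 ILP32 ABI, long is
32 bits wide, so fool would compile to w-register code and the
x-register scan-assembler checks could never match, while long long is
64 bits under both LP64 and ILP32. Assuming the respun test keeps the
same shape, the DImode variant would presumably read:

  long long
  fool (long long x)
  {
    return x == 0x123456;  /* Expected to split into sub + subs + cset.  */
  }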