diff --git a/gcc/combine.cc b/gcc/combine.cc index a5fabf3..9412a9c 100644 --- a/gcc/combine.cc +++ b/gcc/combine.cc @@ -7288,7 +7288,17 @@ expand_compound_operation (rtx x) && (STORE_FLAG_VALUE & ~GET_MODE_MASK (inner_mode)) == 0) return SUBREG_REG (XEXP (x, 0)); + /* If ZERO_EXTEND is cheap on this target, do nothing, + i.e. don't attempt to convert it to a pair of shifts. */ + if (set_src_cost (x, mode, optimize_this_for_speed_p) + <= COSTS_N_INSNS (1)) + return x; } + /* Likewise, if SIGN_EXTEND is cheap, do nothing. */ + else if (GET_CODE (x) == SIGN_EXTEND + && set_src_cost (x, mode, optimize_this_for_speed_p) + <= COSTS_N_INSNS (1)) + return x; /* If we reach here, we want to return a pair of shifts. The inner shift is a left shift of BITSIZE - POS - LEN bits. The outer @@ -7309,7 +7319,11 @@ expand_compound_operation (rtx x) if (modewidth >= pos + len) { tem = gen_lowpart (mode, XEXP (x, 0)); - if (!tem || GET_CODE (tem) == CLOBBER) + if (!tem + || GET_CODE (tem) == CLOBBER + || (GET_CODE (tem) == SUBREG + && !REG_P (SUBREG_REG (tem)) + && !MEM_P (SUBREG_REG (tem)))) return x; tem = simplify_shift_const (NULL_RTX, ASHIFT, mode, tem, modewidth - pos - len); @@ -7321,7 +7335,11 @@ expand_compound_operation (rtx x) tem = simplify_shift_const (NULL_RTX, LSHIFTRT, inner_mode, XEXP (x, 0), pos); tem = gen_lowpart (mode, tem); - if (!tem || GET_CODE (tem) == CLOBBER) + if (!tem + || GET_CODE (tem) == CLOBBER + || (GET_CODE (tem) == SUBREG + && !REG_P (SUBREG_REG (tem)) + && !MEM_P (SUBREG_REG (tem)))) return x; tem = simplify_and_const_int (NULL_RTX, mode, tem, (HOST_WIDE_INT_1U << len) - 1); diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index fc0d6c3..698ca6e 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -1404,22 +1404,32 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode, break; case FFS: - /* (ffs (*_extend )) = (ffs ) */ + /* (ffs (*_extend )) = (*_extend (ffs )). */ if (GET_CODE (op) == SIGN_EXTEND || GET_CODE (op) == ZERO_EXTEND) - return simplify_gen_unary (FFS, mode, XEXP (op, 0), - GET_MODE (XEXP (op, 0))); + { + temp = simplify_gen_unary (FFS, GET_MODE (XEXP (op, 0)), + XEXP (op, 0), GET_MODE (XEXP (op, 0))); + return simplify_gen_unary (GET_CODE (op), mode, temp, + GET_MODE (temp)); + } break; case POPCOUNT: switch (GET_CODE (op)) { case BSWAP: - case ZERO_EXTEND: - /* (popcount (zero_extend )) = (popcount ) */ + /* (popcount (bswap )) = (popcount ). */ return simplify_gen_unary (POPCOUNT, mode, XEXP (op, 0), GET_MODE (XEXP (op, 0))); + case ZERO_EXTEND: + /* (popcount (zero_extend )) = (zero_extend (popcount )). */ + temp = simplify_gen_unary (POPCOUNT, GET_MODE (XEXP (op, 0)), + XEXP (op, 0), GET_MODE (XEXP (op, 0))); + return simplify_gen_unary (ZERO_EXTEND, mode, temp, + GET_MODE (temp)); + case ROTATE: case ROTATERT: /* Rotations don't affect popcount. */ @@ -1438,11 +1448,16 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode, { case NOT: case BSWAP: - case ZERO_EXTEND: - case SIGN_EXTEND: return simplify_gen_unary (PARITY, mode, XEXP (op, 0), GET_MODE (XEXP (op, 0))); + case ZERO_EXTEND: + case SIGN_EXTEND: + temp = simplify_gen_unary (PARITY, GET_MODE (XEXP (op, 0)), + XEXP (op, 0), GET_MODE (XEXP (op, 0))); + return simplify_gen_unary (GET_CODE (op), mode, temp, + GET_MODE (temp)); + case ROTATE: case ROTATERT: /* Rotations don't affect parity. */ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 1be9b66..dcf50cb 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17010,6 +17010,42 @@ (set_attr "type" "bitmanip") (set_attr "mode" "SI")]) +(define_insn_and_split "*clzsi2_lzcnt_zext_2" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT && TARGET_64BIT" + "lzcnt{l}\t{%1, %k0|%k0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (zero_extend:DI (clz:SI (match_dup 1)))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +; False dependency happens when destination is only updated by tzcnt, +; lzcnt or popcnt. There is no false dependency when destination is +; also used in source. +(define_insn "*clzsi2_lzcnt_zext_2_falsedep" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (clz:SI (match_operand:SWI48 1 "nonimmediate_operand" "rm")))) + (unspec [(match_operand:DI 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT" + "lzcnt{l}\t{%1, %k0|%k0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_int_iterator LT_ZCNT [(UNSPEC_TZCNT "TARGET_BMI") (UNSPEC_LZCNT "TARGET_LZCNT")]) @@ -17328,6 +17364,22 @@ (set_attr "prefix" "vex") (set_attr "mode" "DI")]) +(define_insn "*bmi2_bzhi_zero_extendsidi_5" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI + (zero_extend:DI + (plus:SI + (ashift:SI (const_int 1) + (match_operand:QI 2 "register_operand" "r")) + (const_int -1))) + (match_operand:DI 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_BMI2" + "bzhi\t{%q2, %q1, %q0|%q0, %q1, %q2}" + [(set_attr "type" "bitmanip") + (set_attr "prefix" "vex") + (set_attr "mode" "DI")]) + (define_insn "bmi2_pdep_3" [(set (match_operand:SWI48 0 "register_operand" "=r") (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r") @@ -17590,6 +17642,54 @@ (set_attr "type" "bitmanip") (set_attr "mode" "SI")]) +(define_insn_and_split "*popcountsi2_zext_2" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (zero_extend:DI (popcount:SI (match_dup 1)))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +; False dependency happens when destination is only updated by tzcnt, +; lzcnt or popcnt. There is no false dependency when destination is +; also used in source. +(define_insn "*popcountsi2_zext_2_falsedep" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (unspec [(match_operand:DI 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_insn_and_split "*popcounthi2_1" [(set (match_operand:SI 0 "register_operand") (popcount:SI @@ -17608,6 +17708,24 @@ DONE; }) +(define_insn_and_split "*popcounthi2_2" + [(set (match_operand:SI 0 "register_operand") + (zero_extend:SI + (popcount:HI (match_operand:HI 1 "nonimmediate_operand")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx tmp = gen_reg_rtx (HImode); + + emit_insn (gen_popcounthi2 (tmp, operands[1])); + emit_insn (gen_zero_extendhisi2 (operands[0], tmp)); + DONE; +}) + (define_insn "popcounthi2" [(set (match_operand:HI 0 "register_operand" "=r") (popcount:HI @@ -17927,6 +18045,39 @@ PUT_CODE (operands[5], GET_CODE (operands[5]) == EQ ? UNORDERED : ORDERED); }) +;; Eliminate HImode popcount&1 using parity flag (variant 2) +(define_peephole2 + [(match_scratch:HI 0 "Q") + (parallel [(set (match_operand:HI 1 "register_operand") + (popcount:HI + (match_operand:HI 2 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))]) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 3 "register_operand") + (const_int 1)) + (const_int 0))) + (set (pc) (if_then_else (match_operator 4 "bt_comparison_operator" + [(reg:CCZ FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 5)) + (pc)))] + "REGNO (operands[1]) == REGNO (operands[3]) + && peep2_reg_dead_p (2, operands[1]) + && peep2_reg_dead_p (2, operands[3]) + && peep2_regno_dead_p (3, FLAGS_REG)" + [(set (match_dup 0) (match_dup 2)) + (parallel [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) + (clobber (match_dup 0))]) + (set (pc) (if_then_else (match_op_dup 4 [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_dup 5)) + (pc)))] +{ + operands[4] = shallow_copy_rtx (operands[4]); + PUT_CODE (operands[4], GET_CODE (operands[4]) == EQ ? UNORDERED : ORDERED); +}) + ;; Thread-local storage patterns for ELF. ;;