Index: gcc/config/rs6000/predicates.md =================================================================== --- gcc/config/rs6000/predicates.md (revision 199168) +++ gcc/config/rs6000/predicates.md (working copy) @@ -1654,3 +1654,99 @@ (define_predicate "small_toc_ref" return GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_TOCREL; }) + +;; Match the first insn (addis) in fusing the combination of addis and loads to +;; GPR registers on power8. Power8 currently will only do the fusion if the +;; top 11 bits of the addis value are all 1's or 0's. +(define_predicate "fusion_gpr_addis" + (match_code "const_int,high,plus") +{ + HOST_WIDE_INT value; + rtx int_const; + + /* 32-bit is not done yet. */ + if (TARGET_ELF && !TARGET_POWERPC64) + return 0; + + if (GET_CODE (op) == HIGH) + return 1; + + if (CONST_INT_P (op)) + int_const = op; + + else if (GET_CODE (op) == PLUS + && base_reg_operand (XEXP (op, 0), Pmode) + && CONST_INT_P (XEXP (op, 1))) + int_const = XEXP (op, 1); + + else + return 0; + + value = INTVAL (int_const); + if ((value & (HOST_WIDE_INT)0xffff) != 0) + return 0; + + if ((value & (HOST_WIDE_INT)0xffff0000) == 0) + return 0; + + return (IN_RANGE (value >> 16, -32, 31)); +}) + +;; Match the second insn (lbz, lhz, lwz, ld) in fusing the combination of addis +;; and loads to GPR registers on power8. +(define_predicate "fusion_gpr_mem_load" + (match_code "mem") +{ + rtx addr; + + if (!MEM_P (op)) + return 0; + + switch (mode) + { + case QImode: + case HImode: + case SImode: + break; + + case DImode: + if (!TARGET_POWERPC64) + return 0; + break; + + default: + return 0; + } + + addr = XEXP (op, 0); + if (GET_CODE (addr) == PLUS) + { + rtx base = XEXP (addr, 0); + rtx offset = XEXP (addr, 1); + + return (base_reg_operand (base, GET_MODE (base)) + && satisfies_constraint_I (offset)); + } + + else if (GET_CODE (addr) == LO_SUM) + { + rtx base = XEXP (addr, 0); + rtx offset = XEXP (addr, 1); + + /* 32-bit is not done yet. 
*/ + if (TARGET_ELF && !TARGET_POWERPC64) + return 0; + + if (!base_reg_operand (base, GET_MODE (base))) + return 0; + + else if (TARGET_XCOFF || (TARGET_ELF && TARGET_POWERPC64)) + return small_toc_ref (offset, GET_MODE (offset)); +/* Disabled: 32-bit ELF is rejected by the early return above. + else if (TARGET_ELF && !TARGET_POWERPC64) + return CONSTANT_P (offset); +*/ + } + + return 0; +}) Index: gcc/config/rs6000/rs6000-modes.def =================================================================== --- gcc/config/rs6000/rs6000-modes.def (revision 199037) +++ gcc/config/rs6000/rs6000-modes.def (working copy) @@ -42,5 +42,7 @@ VECTOR_MODES (FLOAT, 8); /* VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF */ -/* Replacement for TImode that only is allowed in GPRs. */ +/* Replacement for TImode that only is allowed in GPRs. We also use PTImode + for quad memory atomic operations to force getting an even/odd register + combination. */ PARTIAL_INT_MODE (TI); Index: gcc/config/rs6000/rs6000-protos.h =================================================================== --- gcc/config/rs6000/rs6000-protos.h (revision 199200) +++ gcc/config/rs6000/rs6000-protos.h (working copy) @@ -73,6 +73,8 @@ extern int mems_ok_for_quad_peep (rtx, r extern bool gpr_or_gpr_p (rtx, rtx); extern bool direct_move_p (rtx, rtx); extern bool quad_load_store_p (rtx, rtx); +extern bool fusion_gpr_load_p (rtx, rtx, rtx, rtx, rtx); +extern const char *emit_fusion_gpr_load (rtx, rtx, rtx, rtx); extern enum reg_class (*rs6000_preferred_reload_class_ptr) (rtx, enum reg_class); extern enum reg_class (*rs6000_secondary_reload_class_ptr) (enum reg_class, Index: gcc/config/rs6000/rs6000.opt =================================================================== --- gcc/config/rs6000/rs6000.opt (revision 199122) +++ gcc/config/rs6000/rs6000.opt (working copy) @@ -542,3 +542,11 @@ Use ISA 2.07 direct move between GPR & V mquad-memory Target Report Mask(QUAD_MEMORY) 
Var(rs6000_isa_flags) Generate the quad word memory instructions 
(lq/stq/lqarx/stqcx). + +mlra +Target Report Mask(LRA) Var(rs6000_isa_flags) +Enable the use of the LRA (local register allocator). + +mconstrain-regs +Target Undocumented Mask(CONSTRAIN_REGS) Var(rs6000_isa_flags) +; Only allow ints of certain modes to go in SPRs Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 199210) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -1044,6 +1044,7 @@ static bool rs6000_debug_cannot_change_m enum machine_mode, enum reg_class); static bool rs6000_save_toc_in_prologue_p (void); +static bool rs6000_lra_p (void); rtx (*rs6000_legitimize_reload_address_ptr) (rtx, enum machine_mode, int, int, int, int *) @@ -1519,6 +1520,9 @@ static const struct attribute_spec rs600 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK #define TARGET_VECTORIZE_VEC_PERM_CONST_OK rs6000_vectorize_vec_perm_const_ok + +#undef TARGET_LRA_P +#define TARGET_LRA_P rs6000_lra_p /* Processor table. */ @@ -1631,6 +1635,18 @@ rs6000_hard_regno_mode_ok (int regno, en if (mode == TImode && TARGET_VSX_TIMODE && VSX_REGNO_P (regno)) return 1; + /* Allow 64-bit and the 32-bit floating point types in Altivec registers + under Power8. In theory, we would allow 32-bit integers as well. We + allow SDmode, even though no decimal operation works the Altivec + registers, but it is ok for moves. */ + if (TARGET_VSX && VSX_REGNO_P (regno) && TARGET_P8_VECTOR + && VECTOR_MEM_VSX_P (DFmode) + && (mode == DImode + || mode == DDmode + || mode == SFmode + || mode == SDmode)) + return 1; + /* The GPRs can hold any mode, but values bigger than one register cannot go past R31. */ if (INT_REGNO_P (regno)) @@ -1671,6 +1687,18 @@ rs6000_hard_regno_mode_ok (int regno, en if (SPE_SIMD_REGNO_P (regno) && TARGET_SPE && SPE_VECTOR_MODE (mode)) return 1; + /* See if we need to be stricter about what goes into the special + registers (LR, CTR, VRSAVE, VSCR). 
*/ + if (TARGET_CONSTRAIN_REGS) + { + if (regno == LR_REGNO || regno == CTR_REGNO) + return (GET_MODE_CLASS (mode) == MODE_INT + && rs6000_hard_regno_nregs[mode][regno] == 1); + + if (regno == VRSAVE_REGNO || regno == VSCR_REGNO) + return (mode == SImode); + } + /* We cannot put non-VSX TImode or PTImode anywhere except general register and it must be able to fit within the register set. */ @@ -2138,6 +2166,9 @@ rs6000_debug_reg_global (void) fprintf (stderr, DEBUG_FMT_S, "p8 fusion", (TARGET_P8_FUSION_SIGN) ? "zero+sign" : "zero"); + if (TARGET_CONSTRAIN_REGS) + fprintf (stderr, DEBUG_FMT_S, "constrain-regs", "true"); + fprintf (stderr, DEBUG_FMT_S, "plt-format", TARGET_SECURE_PLT ? "secure" : "bss"); fprintf (stderr, DEBUG_FMT_S, "struct-return", @@ -2321,6 +2352,15 @@ rs6000_init_hard_regno_mode_ok (bool glo rs6000_vector_align[TImode] = align64; } + /* SFmode, see if we want to use the VSX unit. */ + if (TARGET_P8_VECTOR) + { + rs6000_vector_unit[SFmode] = VECTOR_P8_VECTOR; + rs6000_vector_mem[SFmode] + = (TARGET_VSX_SCALAR_MEMORY ? VECTOR_P8_VECTOR : VECTOR_NONE); + rs6000_vector_align[SFmode] = align32; + } + /* TODO add SPE and paired floating point vector support. */ /* Register class constraints for the constraints that depend on compile @@ -3067,6 +3107,21 @@ rs6000_option_override_internal (bool gl rs6000_isa_flags &= ~OPTION_MASK_VSX_TIMODE; } + /* Enable power8 fusion if we are tuning for power8, even if we aren't + generating power8 instructions. */ + if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION)) + rs6000_isa_flags |= (processor_target_table[tune_index].target_enable + & OPTION_MASK_P8_FUSION); + + /* Power8 does not fuse sign extended loads with the addis. If we are + optimizing at high levels for speed, convert a sign extended load into a + zero extending load, and an explicit sign extension. 
*/ + if (TARGET_P8_FUSION + && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN) + && optimize_function_for_speed_p (cfun) + && optimize >= 3) + rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN; + if (TARGET_DEBUG_REG || TARGET_DEBUG_TARGET) rs6000_print_isa_options (stderr, 0, "after defaults", rs6000_isa_flags); @@ -28674,12 +28729,14 @@ static struct rs6000_opt_mask const rs60 { { "altivec", OPTION_MASK_ALTIVEC, false, true }, { "cmpb", OPTION_MASK_CMPB, false, true }, + { "constrain-regs", OPTION_MASK_CONSTRAIN_REGS, false, false }, { "crypto", OPTION_MASK_CRYPTO, false, true }, { "direct-move", OPTION_MASK_DIRECT_MOVE, false, true }, { "dlmzb", OPTION_MASK_DLMZB, false, true }, { "fprnd", OPTION_MASK_FPRND, false, true }, { "hard-dfp", OPTION_MASK_DFP, false, true }, { "isel", OPTION_MASK_ISEL, false, true }, + { "lra", OPTION_MASK_LRA, false, false }, { "mfcrf", OPTION_MASK_MFCRF, false, true }, { "mfpgpr", OPTION_MASK_MFPGPR, false, true }, { "mulhw", OPTION_MASK_MULHW, false, true }, @@ -29683,6 +29740,254 @@ rs6000_set_up_by_prologue (struct hard_r add_to_hard_reg_set (&set->set, Pmode, RS6000_PIC_OFFSET_TABLE_REGNUM); } + +/* Enable/disable the LRA (local register allocator). */ + +static bool +rs6000_lra_p (void) +{ + return TARGET_LRA; +} + + +/* Return true if a peephole or peephole2 can combine an addis instruction + and a GPR load with an offset so that the pair can be fused together on + a power8. */ + +bool +fusion_gpr_load_p (rtx addis_reg, /* reg. to hold high value. */ + rtx addis_value, /* high value loaded. */ + rtx target, /* reg. that is loaded. */ + rtx mem, /* memory to load. */ + rtx insn) /* insn for looking up reg notes or + NULL_RTX if this is a peephole2. */ +{ + rtx addr; + rtx base_reg; + + /* Validate arguments. 
*/ + if (!base_reg_operand (addis_reg, GET_MODE (addis_reg))) + return false; + + if (!base_reg_operand (target, GET_MODE (target))) + return false; + + if (!fusion_gpr_addis (addis_value, GET_MODE (addis_value))) + return false; + + if (!fusion_gpr_mem_load (mem, GET_MODE (mem))) + return false; + + /* Validate that the register used to load the high value is either the + register being loaded, or we can safely replace its use in a peephole. + + If this is a peephole2, we assume that there are 2 instructions in the + peephole (addis and load), so we want to check if the target register was + not used and the register to hold the addis result is dead after the + peephole. */ + if (REGNO (addis_reg) != REGNO (target)) + { + if (reg_mentioned_p (target, mem)) + return false; + + if (insn) + { + if (!find_reg_note (insn, REG_DEAD, addis_reg)) + return false; + } + else + { + if (!peep2_reg_dead_p (2, addis_reg)) + return false; + } + } + + /* Validate that the value being loaded in the addis is used in the load. */ + addr = XEXP (mem, 0); /* either PLUS or LO_SUM. */ + if (GET_CODE (addr) != PLUS && GET_CODE (addr) != LO_SUM) + return false; + + base_reg = XEXP (addr, 0); + return REGNO (addis_reg) == REGNO (base_reg); +} + +/* Emit an addis instruction fused with a gpr load, both writing TARGET, + the register that is loaded. The code is complicated, + so we call output_asm_insn directly, and just return "". 
*/ + +const char * +emit_fusion_gpr_load (rtx addis_reg, rtx addis_value, rtx target, rtx mem) +{ + rtx fuse_ops[10]; + rtx addr; + rtx load_offset; + const char *addis_str = NULL; + const char *load_str = NULL; + const char *mode_name = NULL; + char insn_template[80]; /* NOTE(review): filled by unchecked sprintf. */ + enum machine_mode mode = GET_MODE (mem); + const char *comment_str = ASM_COMMENT_START; + + if (*comment_str == ' ') + comment_str++; + + if (!MEM_P (mem)) + gcc_unreachable (); + + addr = XEXP (mem, 0); + if (GET_CODE (addr) != PLUS && GET_CODE (addr) != LO_SUM) + gcc_unreachable (); + + load_offset = XEXP (addr, 1); + + /* Now emit the load instruction to the same register. */ + switch (mode) + { + case QImode: + mode_name = "char"; + load_str = "lbz"; + break; + + case HImode: + mode_name = "short"; + load_str = "lhz"; + break; + + case SImode: + mode_name = "int"; + load_str = "lwz"; + break; + + case DImode: + if (TARGET_POWERPC64) + { + mode_name = "long"; + load_str = "ld"; + } + break; + + default: + break; + } + + if (!load_str) + gcc_unreachable (); + + /* Emit the addis instruction. */ + fuse_ops[0] = target; + fuse_ops[1] = addis_reg; + if (satisfies_constraint_L (addis_value)) + { + fuse_ops[2] = addis_value; + addis_str = "lis %0,%v2"; + } + + else if (GET_CODE (addis_value) == PLUS) + { + rtx op0 = XEXP (addis_value, 0); + rtx op1 = XEXP (addis_value, 1); + + if (REG_P (op0) && CONST_INT_P (op1) + && satisfies_constraint_L (op1)) + { + fuse_ops[2] = op0; + fuse_ops[3] = op1; + addis_str = "addis %0,%2,%v3"; + } + } + + else if (GET_CODE (addis_value) == HIGH) + { + rtx value = XEXP (addis_value, 0); + if (GET_CODE (value) == UNSPEC && XINT (value, 1) == UNSPEC_TOCREL) + { + fuse_ops[2] = XVECEXP (value, 0, 0); /* symbol ref. */ + fuse_ops[3] = XVECEXP (value, 0, 1); /* TOC register. 
*/ + if (TARGET_ELF) + addis_str = "addis %0,%3,%2@toc@ha"; + + else if (TARGET_XCOFF) + addis_str = "addis %0,%2@u(%3)"; + } + + else if (GET_CODE (value) == PLUS) + { + rtx op0 = XEXP (value, 0); + rtx op1 = XEXP (value, 1); + + if (GET_CODE (op0) == UNSPEC + && XINT (op0, 1) == UNSPEC_TOCREL + && CONST_INT_P (op1)) + { + fuse_ops[2] = XVECEXP (op0, 0, 0); /* symbol ref. */ + fuse_ops[3] = XVECEXP (op0, 0, 1); /* TOC register. */ + fuse_ops[4] = op1; + if (TARGET_ELF) + addis_str = "addis %0,%3,%2+%4@toc@ha"; + + else if (TARGET_XCOFF) + addis_str = "addis %0,%2+%4@u(%3)"; + } + } + } + + if (!addis_str) + gcc_unreachable (); + + sprintf (insn_template, "%s\t\t%s gpr load fusion, type %s, addis reg %%1", + addis_str, comment_str, mode_name); + output_asm_insn (insn_template, fuse_ops); + + if (CONST_INT_P (load_offset) && satisfies_constraint_I (load_offset)) + { + sprintf (insn_template, "%s %%0,%%1(%%0)", load_str); + fuse_ops[1] = load_offset; + output_asm_insn (insn_template, fuse_ops); + } + + else if (GET_CODE (load_offset) == UNSPEC + && XINT (load_offset, 1) == UNSPEC_TOCREL) + { + if (TARGET_ELF) + sprintf (insn_template, "%s %%0,%%1@toc@l(%%0)", load_str); + + else if (TARGET_XCOFF) + sprintf (insn_template, "%s %%0,%%1@l(%%0)", load_str); + + else + gcc_unreachable (); + + fuse_ops[1] = XVECEXP (load_offset, 0, 0); + output_asm_insn (insn_template, fuse_ops); + } + + else if (GET_CODE (load_offset) == PLUS + && GET_CODE (XEXP (load_offset, 0)) == UNSPEC + && XINT (XEXP (load_offset, 0), 1) == UNSPEC_TOCREL + && CONST_INT_P (XEXP (load_offset, 1))) + { + rtx tocrel_unspec = XEXP (load_offset, 0); + if (TARGET_ELF) + sprintf (insn_template, "%s %%0,%%1+%%2@toc@l(%%0)", load_str); + + else if (TARGET_XCOFF) + sprintf (insn_template, "%s %%0,%%1+%%2@l(%%0)", load_str); + + else + gcc_unreachable (); + + fuse_ops[1] = XVECEXP (tocrel_unspec, 0, 0); + fuse_ops[2] = XEXP (load_offset, 1); + output_asm_insn (insn_template, fuse_ops); + } + + else + 
gcc_unreachable (); + + return ""; +} + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-rs6000.h" Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (revision 199200) +++ gcc/config/rs6000/vsx.md (working copy) @@ -1847,3 +1847,28 @@ (define_insn_and_split "*vsx_reduc_ x %x2,%0,%3" + [(set_attr "length" "8") + (set_attr "type" "vecload")]) + +(define_peephole + [(set (match_operand:P 0 "base_reg_operand" "") + (match_operand:P 1 "short_cint_operand" "")) + (set (match_operand:VSX_M2 2 "vsx_register_operand" "") + (mem:VSX_M2 (plus:P (match_operand:P 3 "int_reg_operand" "") + (match_dup 0))))] + "TARGET_P8_FUSION" + "li %0,%1\t\t# vector load fusion\;lx<VSX_M2:VSm>x %x2,%0,%3" + [(set_attr "length" "8") + (set_attr "type" "vecload")]) Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 199210) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -15237,6 +15237,117 @@ (define_insn "rs6000_mftb_" }) +;; Power8 fusion support for fusing an addis instruction with a D-form load of +;; a GPR. The addis instruction must be adjacent to the load, and use the same +;; register that is being loaded. + +;; Note, the fused ops must be adjacent, so don't split these ops. Originally +;; these were written as define_peephole2's, but moved to define_peephole's so +;; that it doesn't conflict with cse after reload in some cases. 
+
+;; GPR fusion for single word integer types
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+        (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:INT1 2 "base_reg_operand" "")
+        (match_operand:INT1 3 "fusion_gpr_mem_load" ""))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                               operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+(define_peephole
+  [(set (match_operand:DI 0 "base_reg_operand" "")
+        (match_operand:DI 1 "fusion_gpr_addis" ""))
+   (set (match_operand:DI 2 "base_reg_operand" "")
+        (zero_extend:DI (match_operand:QHSI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_POWERPC64
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                               operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+;; Power8 does not fuse a sign extending load, so convert the sign extending
+;; load into a zero extending load, and do an explicit sign extension.  Don't
+;; do this if we are trying to optimize for space.  Do this as a peephole2 to
+;; allow final rtl optimizations and scheduling to move the sign extend.
+(define_peephole2
+  [(set (match_operand:DI 0 "base_reg_operand" "")
+        (match_operand:DI 1 "fusion_gpr_addis" ""))
+   (set (match_operand:DI 2 "base_reg_operand" "")
+        (sign_extend:DI (match_operand:HSI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_P8_FUSION_SIGN && TARGET_POWERPC64
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         NULL_RTX)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 2) (sign_extend:DI (match_dup 4)))]
+{
+  unsigned int offset
+    = (BYTES_BIG_ENDIAN ? 8 - GET_MODE_SIZE (<MODE>mode) : 0);
+  /* operands[4] is the DImode target viewed in the mode of the load.  */
+  operands[4] = simplify_subreg (<MODE>mode, operands[2], DImode,
+                                 offset);
+})
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+        (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:SI 2 "base_reg_operand" "")
+        (zero_extend:SI (match_operand:QHI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                               operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+(define_peephole2
+  [(set (match_operand:P 0 "base_reg_operand" "")
+        (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:SI 2 "base_reg_operand" "")
+        (sign_extend:SI (match_operand:HI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_P8_FUSION_SIGN
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         NULL_RTX)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 2) (sign_extend:SI (match_dup 4)))]
+{
+  unsigned int offset = (BYTES_BIG_ENDIAN ? 2 : 0);
+  /* operands[4] is the SImode target viewed as HImode at that offset.  */
+  operands[4] = simplify_subreg (HImode, operands[2], SImode, offset);
+})
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+        (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:HI 2 "base_reg_operand" "")
+        (zero_extend:HI (match_operand:QI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                         insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                               operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+
 
 (include "sync.md")
 (include "vector.md")