diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index be38586..a17327e 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -291,7 +291,11 @@ scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
   insns = BITMAP_ALLOC (NULL);
   defs = BITMAP_ALLOC (NULL);
   defs_conv = BITMAP_ALLOC (NULL);
+  insns_conv = BITMAP_ALLOC (NULL);
   queue = NULL;
+
+  n_sse_to_integer = 0;
+  n_integer_to_sse = 0;
 }
 
 /* Free chain's data.  */
@@ -301,6 +305,7 @@ scalar_chain::~scalar_chain ()
   BITMAP_FREE (insns);
   BITMAP_FREE (defs);
   BITMAP_FREE (defs_conv);
+  BITMAP_FREE (insns_conv);
   bitmap_obstack_release (NULL);
 }
 
@@ -319,25 +324,11 @@ scalar_chain::add_to_queue (unsigned insn_uid)
   bitmap_set_bit (queue, insn_uid);
 }
 
-general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
-                                            enum machine_mode vmode_)
-  : scalar_chain (smode_, vmode_)
-{
-  insns_conv = BITMAP_ALLOC (NULL);
-  n_sse_to_integer = 0;
-  n_integer_to_sse = 0;
-}
-
-general_scalar_chain::~general_scalar_chain ()
-{
-  BITMAP_FREE (insns_conv);
-}
-
 /* For DImode conversion, mark register defined by DEF as requiring
    conversion.  */
 
 void
-general_scalar_chain::mark_dual_mode_def (df_ref def)
+scalar_chain::mark_dual_mode_def (df_ref def)
 {
   gcc_assert (DF_REF_REG_DEF_P (def));
 
@@ -364,14 +355,6 @@ general_scalar_chain::mark_dual_mode_def (df_ref def)
             DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
 }
 
-/* For TImode conversion, it is unused.  */
-
-void
-timode_scalar_chain::mark_dual_mode_def (df_ref)
-{
-  gcc_unreachable ();
-}
-
 /* Check REF's chain to add new insns into a queue
    and find registers requiring conversion.  */
 
@@ -934,7 +917,7 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 /* Convert COMPARE to vector mode.  */
 
 rtx
-general_scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
+scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
 {
   rtx tmp = gen_reg_rtx (vmode);
   rtx src;
@@ -1156,6 +1139,83 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
   df_insn_rescan (insn);
 }
 
+/* Compute a gain for chain conversion.  */
+
+int
+timode_scalar_chain::compute_convert_gain ()
+{
+  /* Assume that if we have to move TImode values between units,
+     then transforming this chain isn't worth it.  */
+  if (n_sse_to_integer || n_integer_to_sse)
+    return -1;
+
+  bitmap_iterator bi;
+  unsigned insn_uid;
+
+  /* Split ties to prefer V1TImode when not optimizing for size.  */
+  int gain = optimize_size ? 0 : 1;
+
+  if (dump_file)
+    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
+
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
+    {
+      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+      rtx def_set = single_set (insn);
+      rtx src = SET_SRC (def_set);
+      rtx dst = SET_DEST (def_set);
+      int igain = 0;
+
+      switch (GET_CODE (src))
+        {
+        case REG:
+          if (optimize_insn_for_size_p ())
+            igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
+          else
+            igain = COSTS_N_INSNS (1);
+          break;
+
+        case MEM:
+          igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
+                                              : COSTS_N_INSNS (1);
+          break;
+
+        case CONST_INT:
+          if (MEM_P (dst)
+              && standard_sse_constant_p (src, V1TImode))
+            igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
+          break;
+
+        case NOT:
+          if (MEM_P (dst))
+            igain = -COSTS_N_INSNS (1);
+          break;
+
+        case AND:
+        case XOR:
+        case IOR:
+          if (!MEM_P (dst))
+            igain = COSTS_N_INSNS (1);
+          break;
+
+        default:
+          break;
+        }
+
+      if (igain != 0 && dump_file)
+        {
+          fprintf (dump_file, "  Instruction gain %d for ", igain);
+          dump_insn_slim (dump_file, insn);
+        }
+      gain += igain;
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "  Total gain: %d\n", gain);
+
+  return gain;
+}
+
 /* Fix uses of converted REG in debug insns.  */
 
 void
@@ -1194,6 +1254,61 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
     }
 }
 
+/* Convert operand OP in INSN from TImode to V1TImode.  */
+
+void
+timode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
+{
+  *op = copy_rtx_if_shared (*op);
+
+  if (REG_P (*op))
+    *op = gen_rtx_SUBREG (V1TImode, *op, 0);
+  else if (MEM_P (*op))
+    {
+      rtx tmp = gen_reg_rtx (V1TImode);
+      emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (V1TImode, tmp, 0),
+                                     gen_gpr_to_xmm_move_src (V1TImode, *op)),
+                        insn);
+      *op = gen_rtx_SUBREG (V1TImode, tmp, 0);
+
+      if (dump_file)
+        fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
+                 INSN_UID (insn), REGNO (tmp));
+    }
+  else if (CONST_INT_P (*op))
+    {
+      rtx vec_cst;
+      rtx tmp = gen_rtx_SUBREG (V1TImode, gen_reg_rtx (TImode), 0);
+
+      /* Prefer all ones vector in case of -1.  */
+      if (constm1_operand (*op, TImode))
+        vec_cst = CONSTM1_RTX (V1TImode);
+      else
+        {
+          rtx *v = XALLOCAVEC (rtx, 1);
+          v[0] = *op;
+          vec_cst = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec_v (1, v));
+        }
+
+      if (!standard_sse_constant_p (vec_cst, V1TImode))
+        {
+          start_sequence ();
+          vec_cst = validize_mem (force_const_mem (V1TImode, vec_cst));
+          rtx_insn *seq = get_insns ();
+          end_sequence ();
+          emit_insn_before (seq, insn);
+        }
+
+      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
+      *op = tmp;
+    }
+  else
+    {
+      gcc_assert (SUBREG_P (*op));
+      gcc_assert (GET_MODE (*op) == vmode);
+    }
+}
+
 /* Convert INSN from TImode to V1T1mode.  */
 
 void
@@ -1202,13 +1317,14 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
   rtx def_set = single_set (insn);
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
+  rtx tmp;
 
   switch (GET_CODE (dst))
     {
     case REG:
       {
-        rtx tmp = find_reg_equal_equiv_note (insn);
-        if (tmp)
+        tmp = find_reg_equal_equiv_note (insn);
+        if (tmp && GET_MODE (XEXP (tmp, 0)) == TImode)
           PUT_MODE (XEXP (tmp, 0), V1TImode);
         PUT_MODE (dst, V1TImode);
         fix_debug_reg_uses (dst);
@@ -1275,6 +1391,49 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
         }
       break;
 
+    case AND:
+      if (GET_CODE (XEXP (src, 0)) == NOT)
+        {
+          convert_op (&XEXP (XEXP (src, 0), 0), insn);
+          convert_op (&XEXP (src, 1), insn);
+          PUT_MODE (XEXP (src, 0), V1TImode);
+          PUT_MODE (src, V1TImode);
+          break;
+        }
+      /* FALLTHRU */
+
+    case XOR:
+    case IOR:
+      convert_op (&XEXP (src, 0), insn);
+      convert_op (&XEXP (src, 1), insn);
+      PUT_MODE (src, V1TImode);
+      if (MEM_P (dst))
+        {
+          tmp = gen_reg_rtx (V1TImode);
+          emit_insn_before (gen_rtx_SET (tmp, src), insn);
+          src = tmp;
+        }
+      break;
+
+    case NOT:
+      src = XEXP (src, 0);
+      convert_op (&src, insn);
+      tmp = gen_reg_rtx (V1TImode);
+      emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
+      src = gen_rtx_XOR (V1TImode, src, tmp);
+      if (MEM_P (dst))
+        {
+          tmp = gen_reg_rtx (V1TImode);
+          emit_insn_before (gen_rtx_SET (tmp, src), insn);
+          src = tmp;
+        }
+      break;
+
+    case COMPARE:
+      dst = gen_rtx_REG (CCmode, FLAGS_REG);
+      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -1522,6 +1681,16 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
   return true;
 }
 
+/* Check for a suitable TImode memory operand.  */
+
+static bool
+timode_mem_p (rtx x)
+{
+  return MEM_P (x)
+         && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+             || !misaligned_operand (x, TImode));
+}
+
 /* The TImode version of scalar_to_vector_candidate_p.  */
 
 static bool
@@ -1535,45 +1704,64 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
   rtx src = SET_SRC (def_set);
   rtx dst = SET_DEST (def_set);
 
-  /* Only TImode load and store are allowed.  */
-  if (GET_MODE (dst) != TImode)
+  if (GET_CODE (src) == COMPARE)
+    return convertible_comparison_p (insn, TImode);
+
+  if (GET_MODE (dst) != TImode
+      || (GET_MODE (src) != TImode
+          && !CONST_SCALAR_INT_P (src)))
     return false;
 
-  if (MEM_P (dst))
-    {
-      /* Check for store.  Memory must be aligned or unaligned store
-         is optimal.  Only support store from register, standard SSE
-         constant or CONST_WIDE_INT generated from piecewise store.
+  if (!REG_P (dst) && !MEM_P (dst))
+    return false;
 
-         ??? Verify performance impact before enabling CONST_INT for
-         __int128 store.  */
-      if (misaligned_operand (dst, TImode)
-          && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
-        return false;
+  if (MEM_P (dst)
+      && misaligned_operand (dst, TImode)
+      && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+    return false;
 
-      switch (GET_CODE (src))
-        {
-        default:
-          return false;
+  switch (GET_CODE (src))
+    {
+    case REG:
+    case CONST_WIDE_INT:
+      return true;
 
-        case REG:
-        case CONST_WIDE_INT:
-          return true;
+    case CONST_INT:
+      /* ??? Verify performance impact before enabling CONST_INT for
+         __int128 store.  */
+      return standard_sse_constant_p (src, TImode);
 
-        case CONST_INT:
-          return standard_sse_constant_p (src, TImode);
-        }
-    }
-  else if (MEM_P (src))
-    {
-      /* Check for load.  Memory must be aligned or unaligned load is
-         optimal.  */
+    case MEM:
+      /* Memory must be aligned or unaligned load is optimal.  */
       return (REG_P (dst)
               && (!misaligned_operand (src, TImode)
                   || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
-    }
 
-  return false;
+    case AND:
+      if (MEM_P (dst))
+        return false;
+      if (GET_CODE (XEXP (src, 0)) == NOT
+          && REG_P (XEXP (XEXP (src, 0), 0))
+          && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))))
+        return true;
+      return REG_P (XEXP (src, 0))
+             && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1)));
+
+    case IOR:
+    case XOR:
+      if (MEM_P (dst))
+        return false;
+      return REG_P (XEXP (src, 0))
+             && (REG_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1)));
+
+    case NOT:
+      if (MEM_P (dst))
+        return false;
+      return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
+
+    default:
+      return false;
+    }
 }
 
 /* For a register REGNO, scan instructions for its defs and uses.
diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 839b63c..88b222d 100644
--- a/gcc/config/i386/i386-features.h
+++ b/gcc/config/i386/i386-features.h
@@ -148,6 +148,10 @@ class scalar_chain
   /* Registers used in both vector and sclar modes.  */
   bitmap defs_conv;
 
+  bitmap insns_conv;
+  unsigned n_sse_to_integer;
+  unsigned n_integer_to_sse;
+
   void build (bitmap candidates, unsigned insn_uid);
   virtual int compute_convert_gain () = 0;
   int convert ();
@@ -155,33 +159,31 @@ class scalar_chain
  protected:
   void add_to_queue (unsigned insn_uid);
   void emit_conversion_insns (rtx insns, rtx_insn *pos);
+  rtx convert_compare (rtx op1, rtx op2, rtx_insn *insn);
+  void mark_dual_mode_def (df_ref def);
 
  private:
   void add_insn (bitmap candidates, unsigned insn_uid);
   void analyze_register_chain (bitmap candidates, df_ref ref);
-  virtual void mark_dual_mode_def (df_ref def) = 0;
   virtual void convert_insn (rtx_insn *insn) = 0;
   virtual void convert_registers () = 0;
+  virtual void convert_op (rtx *op, rtx_insn *insn) = 0;
 };
 
 class general_scalar_chain : public scalar_chain
 {
  public:
-  general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_);
-  ~general_scalar_chain ();
+  general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
+    : scalar_chain (smode_, vmode_) {}
   int compute_convert_gain () final override;
+
  private:
   hash_map<rtx, rtx> defs_map;
-  bitmap insns_conv;
-  unsigned n_sse_to_integer;
-  unsigned n_integer_to_sse;
-  void mark_dual_mode_def (df_ref def) final override;
   void convert_insn (rtx_insn *insn) final override;
-  void convert_op (rtx *op, rtx_insn *insn);
   void convert_reg (rtx_insn *insn, rtx dst, rtx src);
+  void convert_op (rtx *op, rtx_insn *insn);
   void make_vector_copies (rtx_insn *, rtx);
   void convert_registers () final override;
-  rtx convert_compare (rtx op1, rtx op2, rtx_insn *insn);
   int vector_const_cost (rtx exp);
 };
 
@@ -189,21 +191,20 @@ class timode_scalar_chain : public scalar_chain
 {
  public:
   timode_scalar_chain () : scalar_chain (TImode, V1TImode) {}
-
-  /* Convert from TImode to V1TImode is always faster.  */
-  int compute_convert_gain () final override { return 1; }
+  int compute_convert_gain () final override;
 
  private:
-  void mark_dual_mode_def (df_ref def) final override;
   void fix_debug_reg_uses (rtx reg);
   void convert_insn (rtx_insn *insn) final override;
-  /* We don't convert registers to difference size.  */
+  void convert_op (rtx *op, rtx_insn *insn);
+  /* We don't convert registers to different size.  */
   void convert_registers () final override {}
 };
 
 } // anon namespace
 
-bool ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined);
+bool ix86_save_reg (unsigned int regno, bool maybe_eh_return,
+                    bool ignore_outlined);
 int ix86_compare_version_priority (tree decl1, tree decl2);
 tree ix86_generate_version_dispatcher_body (void *node_p);
 tree ix86_get_function_versions_dispatcher (void *decl);
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-2.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-2.c
new file mode 100644
index 0000000..e637927
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a[16];
+__int128 b[16];
+__int128 c[16];
+
+void foo()
+{
+  for (unsigned int i=0; i<16; i++)
+    a[i] = b[i] & c[i];
+}
+
+/* { dg-final { scan-assembler "pand" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-3.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-3.c
new file mode 100644
index 0000000..bdc0bac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a[16];
+__int128 b[16];
+__int128 c[16];
+
+void foo()
+{
+  for (unsigned int i=0; i<16; i++)
+    a[i] = b[i] | c[i];
+}
+
+/* { dg-final { scan-assembler "por" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-4.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-4.c
new file mode 100644
index 0000000..a9ef619
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-4.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a[16];
+__int128 b[16];
+__int128 c[16];
+
+void foo()
+{
+  for (unsigned int i=0; i<16; i++)
+    a[i] = b[i] ^ c[i];
+}
+
+/* { dg-final { scan-assembler "pxor" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-5.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-5.c
new file mode 100644
index 0000000..7eff3c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-5.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a[16];
+__int128 b[16];
+__int128 c[16];
+
+void foo()
+{
+  for (unsigned int i=0; i<16; i++)
+    a[i] = b[i] & ~c[i];
+}
+
+/* { dg-final { scan-assembler "pandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-6.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-6.c
new file mode 100644
index 0000000..407bc55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-6.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 a[16];
+__int128 b[16];
+
+int foo()
+{
+  for (unsigned int i=0; i<16; i++)
+    if (a[i] == b[i])
+      return i;
+  return -1;
+}
+
+/* { dg-final { scan-assembler "ptest" } } */