public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Sylvain Noiry <snoiry@kalrayinc.com>
To: gcc-patches@gcc.gnu.org
Cc: Sylvain Noiry <snoiry@kalrayinc.com>
Subject: [PATCH v2 11/11] Native complex ops: Experimental support in x86 backend
Date: Tue, 12 Sep 2023 12:07:13 +0200	[thread overview]
Message-ID: <20230912100713.1074-12-snoiry@kalrayinc.com> (raw)
In-Reply-To: <20230912100713.1074-1-snoiry@kalrayinc.com>

Summary:
Add experimental support for native complex operation handling in
the x86 backend. For now it only supports add, sub, mul, conj, neg, and mov
in SCmode (complex float). Performance gains are still marginal on this
target because there are no dedicated instructions to speed up complex
operations, except some SIMD tricks.

gcc/ChangeLog:

	* config/i386/i386.cc (classify_argument): Align complex
	element to the whole size, not size of the parts
	(ix86_return_in_memory): Handle complex modes like a scalar
	with the same size
	(ix86_class_max_nregs): Likewise
	(ix86_hard_regno_nregs): Likewise
	(function_value_ms_64): Add case for SCmode
	(ix86_build_const_vector): Likewise
	(ix86_build_signbit_mask): Likewise
	(x86_gen_rtx_complex): New: Implement the gen_rtx_complex
	hook, use registers of complex modes to represent complex
	elements in rtl
	(x86_read_complex_part): New: Implement the read_complex_part
	hook, handle registers of complex modes
	(x86_write_complex_part): New: Implement the write_complex_part
	hook, handle registers of complex modes
	* config/i386/i386.h: Add SCmode in several predicates
	* config/i386/sse.md: Add pattern for some complex operations in
	SCmode. This includes movsc, addsc3, subsc3, negsc2, mulsc3,
	and conjsc2
---
 gcc/config/i386/i386.cc | 296 +++++++++++++++++++++++++++++++++++++++-
 gcc/config/i386/i386.h  |  11 +-
 gcc/config/i386/sse.md  | 144 +++++++++++++++++++
 3 files changed, 440 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 477e6cecc38..77bf80b64b1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -2348,8 +2348,8 @@ classify_argument (machine_mode mode, const_tree type,
 	mode_alignment = 128;
       else if (mode == XCmode)
 	mode_alignment = 256;
-      if (COMPLEX_MODE_P (mode))
-	mode_alignment /= 2;
+      /*if (COMPLEX_MODE_P (mode))
+	mode_alignment /= 2;*/
       /* Misaligned fields are always returned in memory.  */
       if (bit_offset % mode_alignment)
 	return 0;
@@ -3023,6 +3023,7 @@ pass_in_reg:
     case E_V4BFmode:
     case E_V2SImode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V1TImode:
     case E_V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
@@ -3273,6 +3274,7 @@ pass_in_reg:
     case E_V4BFmode:
     case E_V2SImode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V1TImode:
     case E_V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
@@ -4187,8 +4189,8 @@ function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
 	      && !INTEGRAL_TYPE_P (valtype)
 	      && !VECTOR_FLOAT_TYPE_P (valtype))
 	    break;
-	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
-	      && !COMPLEX_MODE_P (mode))
+	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)))
+	     // && !COMPLEX_MODE_P (mode))
 	    regno = FIRST_SSE_REG;
 	  break;
 	case 8:
@@ -4295,7 +4297,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
 	       || INTEGRAL_TYPE_P (type)
 	       || VECTOR_FLOAT_TYPE_P (type))
 	      && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
-	      && !COMPLEX_MODE_P (mode)
+	      //&& !COMPLEX_MODE_P (mode)
 	      && (GET_MODE_SIZE (mode) == 16 || size == 16))
 	    return false;
 
@@ -15752,6 +15754,7 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
     case E_V8SFmode:
     case E_V4SFmode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V8DFmode:
     case E_V4DFmode:
     case E_V2DFmode:
@@ -15800,6 +15803,7 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
     case E_V8SFmode:
     case E_V4SFmode:
     case E_V2SFmode:
+    case E_SCmode:
     case E_V2SImode:
       vec_mode = mode;
       imode = SImode;
@@ -19894,7 +19898,8 @@ ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
   else
     {
       if (COMPLEX_MODE_P (mode))
-	return 2;
+	return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
+	//return 2;
       else
 	return 1;
     }
@@ -20230,7 +20235,8 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
       return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
     }
   if (COMPLEX_MODE_P (mode))
-    return 2;
+    return 1;
+    //return 2;
   /* Register pair for mask registers.  */
   if (mode == P2QImode || mode == P2HImode)
     return 2;
@@ -23757,6 +23763,273 @@ ix86_preferred_simd_mode (scalar_mode mode)
     }
 }
 
+static rtx
+x86_gen_rtx_complex (machine_mode mode, rtx real_part, rtx imag_part)
+{
+  machine_mode imode = GET_MODE_INNER (mode);
+
+  if ((real_part == imag_part) && (real_part == CONST0_RTX (imode)))
+    {
+      if (CONST_DOUBLE_P (real_part))
+       return const_double_from_real_value (dconst0, mode);
+      else if (CONST_INT_P (real_part))
+       return GEN_INT (0);
+      else
+       gcc_unreachable ();
+    }
+
+  bool saved_generating_concat_p = generating_concat_p;
+  generating_concat_p = false;
+  rtx complex_reg = gen_reg_rtx (mode);
+  generating_concat_p = saved_generating_concat_p;
+
+  if (real_part)
+    {
+      gcc_assert (imode == GET_MODE (real_part));
+      write_complex_part (complex_reg, real_part, REAL_P, false);
+    }
+
+  if (imag_part)
+    {
+      gcc_assert (imode == GET_MODE (imag_part));
+      write_complex_part (complex_reg, imag_part, IMAG_P, false);
+    }
+
+  return complex_reg;
+}
+
+static rtx
+x86_read_complex_part (rtx cplx, complex_part_t part)
+{
+  machine_mode cmode;
+  scalar_mode imode;
+  unsigned ibitsize;
+
+  if (GET_CODE (cplx) == CONCAT)
+    return XEXP (cplx, part);
+
+  cmode = GET_MODE (cplx);
+  imode = GET_MODE_INNER (cmode);
+  ibitsize = GET_MODE_BITSIZE (imode);
+
+  if (COMPLEX_MODE_P (cmode) && (part == BOTH_P))
+    return cplx;
+
+  /* For constants under 32-bit, vector constants are folded during expand,
+   * so we need to compensate for it as cplx is an integer constant.
+   * In this case cmode and imode are equal */
+  if (cmode == imode)
+    ibitsize /= 2;
+
+  if (cmode == E_VOIDmode)
+    return cplx;               /* FIXME case used when initialising mock in a complex register */
+
+  if ((cmode == E_DCmode) && (GET_CODE (cplx) == CONST_DOUBLE))        /* FIXME stop generation of DC const_double, because not patterns and wired */
+    return CONST0_RTX (E_DFmode);
+  /* verify as well SC const_double */
+
+  /* Special case reads from complex constants that got spilled to memory.  */
+  if (MEM_P (cplx) && GET_CODE (XEXP (cplx, 0)) == SYMBOL_REF)
+    {
+      tree decl = SYMBOL_REF_DECL (XEXP (cplx, 0));
+      if (decl && TREE_CODE (decl) == COMPLEX_CST)
+	{
+	  tree cplx_part = (part == IMAG_P) ? TREE_IMAGPART (decl)
+			  : (part == REAL_P) ? TREE_REALPART (decl)
+			  : TREE_COMPLEX_BOTH_PARTS (decl);
+	if (CONSTANT_CLASS_P (cplx_part))
+	  return expand_expr (cplx_part, NULL_RTX, imode, EXPAND_NORMAL);
+	}
+    }
+
+  /* For MEMs simplify_gen_subreg may generate an invalid new address
+     because, e.g., the original address is considered mode-dependent
+     by the target, which restricts simplify_subreg from invoking
+     adjust_address_nv.  Instead of preparing fallback support for an
+     invalid address, we call adjust_address_nv directly.  */
+  if (MEM_P (cplx))
+    {
+      if (part == BOTH_P)
+       return adjust_address_nv (cplx, cmode, 0);
+      else
+       return adjust_address_nv (cplx, imode, (part == IMAG_P)
+				 ? GET_MODE_SIZE (imode) : 0);
+    }
+
+  /* If the sub-object is at least word sized, then we know that subregging
+     will work.  This special case is important, since extract_bit_field
+     wants to operate on integer modes, and there's rarely an OImode to
+     correspond to TCmode.  */
+  if (ibitsize >= BITS_PER_WORD
+      /* For hard regs we have exact predicates.  Assume we can split
+	 the original object if it spans an even number of hard regs.
+	 This special case is important for SCmode on 64-bit platforms
+	 where the natural size of floating-point regs is 32-bit.  */
+      || (REG_P (cplx)
+	  && REGNO (cplx) < FIRST_PSEUDO_REGISTER
+	  && REG_NREGS (cplx) % 2 == 0))
+    {
+      rtx ret = simplify_gen_subreg (imode, cplx, cmode, (part == IMAG_P)
+				     ? GET_MODE_SIZE (imode) : 0);
+      if (ret)
+       return ret;
+      else
+       /* simplify_gen_subreg may fail for sub-word MEMs.  */
+       gcc_assert (MEM_P (cplx) && ibitsize < BITS_PER_WORD);
+    }
+
+  if (part == BOTH_P)
+    return extract_bit_field (cplx, 2 * ibitsize, 0, true, NULL_RTX, cmode,
+			      cmode, false, NULL);
+  else
+    return extract_bit_field (cplx, ibitsize, (part == IMAG_P) ? ibitsize : 0,
+			      true, NULL_RTX, imode, imode, false, NULL);
+}
+
+static void
+x86_write_complex_part (rtx cplx, rtx val, complex_part_t part, bool undefined_p)
+{
+  machine_mode cmode;
+  scalar_mode imode;
+  unsigned ibitsize;
+
+  cmode = GET_MODE (cplx);
+  imode = GET_MODE_INNER (cmode);
+  ibitsize = GET_MODE_BITSIZE (imode);
+
+  /* special case for constants */
+  if (GET_CODE (val) == CONST_VECTOR)
+    {
+      if (part == BOTH_P)
+	{
+	  machine_mode temp_mode = E_BLKmode;;
+	  switch (cmode)
+	    {
+	    case E_CQImode:
+	      temp_mode = E_HImode;
+	      break;
+	    case E_CHImode:
+	      temp_mode = E_SImode;
+	      break;
+	    case E_CSImode:
+	      temp_mode = E_DImode;
+	      break;
+	    case E_SCmode:
+	      temp_mode = E_DFmode;
+	      break;
+	    case E_CDImode:
+	      temp_mode = E_TImode;
+	      break;
+	    case E_DCmode:
+	    default:
+	      break;
+	    }
+
+	  if (temp_mode != E_BLKmode)
+	    {
+	      rtx temp_reg = gen_reg_rtx (temp_mode);
+	      store_bit_field (temp_reg, GET_MODE_BITSIZE (temp_mode), 0, 0,
+			       0, GET_MODE (val), val, false, undefined_p);
+	      emit_move_insn (cplx,
+			      simplify_gen_subreg (cmode, temp_reg, temp_mode,
+						   0));
+	    }
+	  else
+	    {
+	      /* write real part and imag part separately */
+	      gcc_assert (GET_CODE (val) == CONST_VECTOR);
+	      write_complex_part (cplx, const_vector_elt (val, 0), REAL_P, false);
+	      write_complex_part (cplx, const_vector_elt (val, 1), IMAG_P, false);
+	    }
+	}
+      else
+	write_complex_part (cplx,
+			    const_vector_elt (val,
+			    ((part == REAL_P) ? 0 : 1)),
+			    part, false);
+      return;
+    }
+
+  if ((part == BOTH_P) && !MEM_P (cplx)
+      /*&& (optab_handler (mov_optab, cmode) != CODE_FOR_nothing)*/)
+    {
+      write_complex_part (cplx, read_complex_part(cplx, REAL_P), REAL_P, undefined_p);
+      write_complex_part (cplx, read_complex_part(cplx, IMAG_P), IMAG_P, undefined_p);
+      //emit_move_insn (cplx, val);
+      return;
+    }
+
+  if ((GET_CODE (val) == CONST_DOUBLE) || (GET_CODE (val) == CONST_INT))
+    {
+      if (part == REAL_P)
+	{
+	  emit_move_insn (gen_lowpart (imode, cplx), val);
+	  return;
+	}
+      else if (part == IMAG_P)
+	{
+	  /* cannot set highpart of a pseudo register */
+	  if (REGNO (cplx) < FIRST_PSEUDO_REGISTER)
+	    {
+	      emit_move_insn (gen_highpart (imode, cplx), val);
+	      return;
+	    }
+	}
+      else
+	gcc_unreachable ();
+    }
+
+  if (GET_CODE (cplx) == CONCAT)
+    {
+      emit_move_insn (XEXP (cplx, part), val);
+      return;
+    }
+
+  /* For MEMs simplify_gen_subreg may generate an invalid new address
+     because, e.g., the original address is considered mode-dependent
+     by the target, which restricts simplify_subreg from invoking
+     adjust_address_nv.  Instead of preparing fallback support for an
+     invalid address, we call adjust_address_nv directly.  */
+  if (MEM_P (cplx))
+    {
+      if (part == BOTH_P)
+       emit_move_insn (adjust_address_nv (cplx, cmode, 0), val);
+      else
+       emit_move_insn (adjust_address_nv (cplx, imode, (part == IMAG_P)
+					  ? GET_MODE_SIZE (imode) : 0), val);
+      return;
+    }
+
+  /* If the sub-object is at least word sized, then we know that subregging
+     will work.  This special case is important, since store_bit_field
+     wants to operate on integer modes, and there's rarely an OImode to
+     correspond to TCmode.  */
+  if (ibitsize >= BITS_PER_WORD
+      /* For hard regs we have exact predicates.  Assume we can split
+	 the original object if it spans an even number of hard regs.
+	 This special case is important for SCmode on 64-bit platforms
+	 where the natural size of floating-point regs is 32-bit.  */
+      || (REG_P (cplx)
+	  && REGNO (cplx) < FIRST_PSEUDO_REGISTER
+	  && REG_NREGS (cplx) % 2 == 0))
+    {
+      rtx cplx_part = simplify_gen_subreg (imode, cplx, cmode,
+					   (part == IMAG_P)
+					   ? GET_MODE_SIZE (imode) : 0);
+      if (cplx_part)
+	{
+	  emit_move_insn (cplx_part, val);
+	  return;
+	}
+      else
+       /* simplify_gen_subreg may fail for sub-word MEMs.  */
+       gcc_assert (MEM_P (cplx) && ibitsize < BITS_PER_WORD);
+    }
+
+  store_bit_field (cplx, ibitsize, (part == IMAG_P) ? ibitsize : 0, 0, 0,
+		   imode, val, false, undefined_p);
+}
+
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
    256bit and 128bit vectors.  */
@@ -25792,6 +26065,15 @@ ix86_libgcc_floating_mode_supported_p
 #undef TARGET_IFUNC_REF_LOCAL_OK
 #define TARGET_IFUNC_REF_LOCAL_OK ix86_ifunc_ref_local_ok
 
+#undef TARGET_GEN_RTX_COMPLEX
+#define TARGET_GEN_RTX_COMPLEX x86_gen_rtx_complex
+
+#undef TARGET_READ_COMPLEX_PART
+#define TARGET_READ_COMPLEX_PART x86_read_complex_part
+
+#undef TARGET_WRITE_COMPLEX_PART
+#define TARGET_WRITE_COMPLEX_PART x86_write_complex_part
+
 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
 # undef TARGET_ASM_RELOC_RW_MASK
 # define TARGET_ASM_RELOC_RW_MASK ix86_reloc_rw_mask
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3e8488f2ae8..faa058f3ec0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1058,7 +1058,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode	\
    || (MODE) == V2DImode || (MODE) == V2QImode				\
    || (MODE) == DFmode	|| (MODE) == DImode				\
-   || (MODE) == HFmode || (MODE) == BFmode)
+   || (MODE) == HFmode || (MODE) == BFmode				\
+   || (MODE) == SCmode)
 
 #define VALID_SSE_REG_MODE(MODE)					\
   ((MODE) == V1TImode || (MODE) == TImode				\
@@ -1067,7 +1068,7 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == TFmode || (MODE) == TDmode)
 
 #define VALID_MMX_REG_MODE_3DNOW(MODE) \
-  ((MODE) == V2SFmode || (MODE) == SFmode)
+  ((MODE) == V2SFmode || (MODE) == SFmode || (MODE) == SCmode)
 
 /* To match ia32 psABI, V4HFmode should be added here.  */
 #define VALID_MMX_REG_MODE(MODE)					\
@@ -1110,13 +1111,15 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
    || (MODE) == V16SFmode \
    || (MODE) == V32HFmode || (MODE) == V16HFmode || (MODE) == V8HFmode  \
-   || (MODE) == V32BFmode || (MODE) == V16BFmode || (MODE) == V8BFmode)
+   || (MODE) == V32BFmode || (MODE) == V16BFmode || (MODE) == V8BFmode	\
+   || (MODE) == SCmode)
 
 #define X87_FLOAT_MODE_P(MODE)	\
   (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
 
 #define SSE_FLOAT_MODE_P(MODE) \
-  ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
+  ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode) \
+   || (TARGET_SSE2 && (MODE) == SCmode))
 
 #define SSE_FLOAT_MODE_SSEMATH_OR_HF_P(MODE)				\
   ((SSE_FLOAT_MODE_P (MODE) && TARGET_SSE_MATH)				\
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 80b43fd7db7..06281eb0fd6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30504,3 +30504,147 @@
   "TARGET_AVXVNNIINT16"
   "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "prefix" "vex")])
+
+(define_expand "movsc"
+  [(match_operand:SC 0 "nonimmediate_operand" "")
+   (match_operand:SC 1 "nonimmediate_operand" "")]
+  ""
+  {
+    emit_insn (gen_movv2sf (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "addsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_addv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "subsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_subv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+			     simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "negsc2"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_negv2sf2 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+                             simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "sse_shufsc"
+  [(match_operand:V4SF 0 "register_operand")
+   (match_operand:SC 1 "register_operand")
+   (match_operand:SC 2 "vector_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_SSE"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_sse_shufsc_sc (operands[0],
+						     operands[1],
+						     operands[2],
+						     GEN_INT ((mask >> 0) & 3),
+						     GEN_INT ((mask >> 2) & 3),
+						     GEN_INT (((mask >> 4) & 3) + 4),
+						     GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+(define_insn "sse_shufsc_sc"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,v")
+	(vec_select:V4SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "vector_operand" "xBm,vm"))
+	  (parallel [(match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_4_to_7_operand")
+		     (match_operand 6 "const_4_to_7_operand")])))]
+  "TARGET_SSE"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[3]) << 0;
+  mask |= INTVAL (operands[4]) << 2;
+  mask |= (INTVAL (operands[5]) - 4) << 4;
+  mask |= (INTVAL (operands[6]) - 4) << 6;
+  operands[3] = GEN_INT (mask);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "shufps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "mode" "V4SF")])
+
+(define_expand "mulsc3"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")
+   (match_operand:SC 2 "register_operand" "r")]
+  "TARGET_SSE3"
+  {
+    rtx a = gen_reg_rtx (V4SFmode);
+    rtx b = gen_reg_rtx (V4SFmode);
+    emit_insn (gen_sse_shufsc (a,
+                                    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+                                    simplify_gen_subreg (V2SFmode, operands[1], SCmode, 0),
+                                    GEN_INT (0b01000100)));
+    emit_insn (gen_sse_shufsc (b,
+                                    simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0),
+                                    simplify_gen_subreg (V2SFmode, operands[2], SCmode, 0),
+                                    GEN_INT (0b00010100)));
+    emit_insn (gen_mulv4sf3 (a, a, b));
+    emit_insn (gen_sse_shufps (b,
+                                    a,
+                                    a,
+                                    GEN_INT (0b00001101)));
+    emit_insn (gen_sse_shufps (a,
+                                    a,
+                                    a,
+                                    GEN_INT (0b00001000)));
+    emit_insn (gen_vec_addsubv2sf3 (simplify_gen_subreg (V2SFmode, operands[0], SCmode, 0),
+				    simplify_gen_subreg (V2SFmode, a, V4SFmode, 0),
+				    simplify_gen_subreg (V2SFmode, b, V4SFmode, 0)));
+    DONE;
+  }
+)
+
+(define_expand "conjsc2"
+  [(match_operand:SC 0 "register_operand" "=r")
+   (match_operand:SC 1 "register_operand" "r")]
+  ""
+  {
+    emit_insn (gen_negdf2 (simplify_gen_subreg (DFmode, operands[0], SCmode, 0),
+			   simplify_gen_subreg (DFmode, operands[1], SCmode, 0)));
+    DONE;
+  }
+)
-- 
2.17.1






      parent reply	other threads:[~2023-09-12 10:08 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-17  9:02 [PATCH 0/9] Native complex operations Sylvain Noiry
2023-07-17  9:02 ` [PATCH 1/9] Native complex operations: Conditional lowering Sylvain Noiry
2023-07-17  9:02 ` [PATCH 2/9] Native complex operations: Move functions to hooks Sylvain Noiry
2023-07-17  9:02 ` [PATCH 3/9] Native complex operations: Add gen_rtx_complex hook Sylvain Noiry
2023-07-17  9:02 ` [PATCH 4/9] Native complex operations: Allow native complex regs and ops in rtl Sylvain Noiry
2023-07-17  9:02 ` [PATCH 5/9] Native complex operations: Add the conjugate op in optabs Sylvain Noiry
2023-07-17  9:02 ` [PATCH 6/9] Native complex operations: Update how complex rotations are handled Sylvain Noiry
2023-07-17  9:02 ` [PATCH 7/9] Native complex operations: Vectorization of native complex operations Sylvain Noiry
2023-07-17  9:02 ` [PATCH 8/9] Native complex operations: Add explicit vector of complex Sylvain Noiry
2023-07-17  9:02 ` [PATCH 9/9] Native complex operation: Experimental support in x86 backend Sylvain Noiry
2023-09-12 10:07   ` [PATCH v2 0/11] Native complex operations Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 01/11] Native complex ops : Conditional lowering Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 02/11] Native complex ops: Move functions to hooks Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 03/11] Native complex ops: Add gen_rtx_complex hook Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 04/11] Native complex ops: Allow native complex regs and ops in rtl Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 05/11] Native complex ops: Add the conjugate op in optabs Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 06/11] Native complex ops: Update how complex rotations are handled Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 07/11] Native complex ops: Vectorization of native complex operations Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 08/11] Native complex ops: Add explicit vector of complex Sylvain Noiry
2023-09-12 17:25       ` Joseph Myers
2023-09-13  6:48         ` Richard Biener
2023-09-12 10:07     ` [PATCH v2 09/11] Native complex ops: remove useless special cases Sylvain Noiry
2023-09-12 10:07     ` [PATCH v2 10/11] Native complex ops: Add a fast complex multiplication pattern Sylvain Noiry
2023-09-12 10:07     ` Sylvain Noiry [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230912100713.1074-12-snoiry@kalrayinc.com \
    --to=snoiry@kalrayinc.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).