public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] LoongArch: Optimizations of vector construction.
@ 2023-09-21  1:19 Guo Jie
  2023-09-25  3:04 ` chenglulu
  0 siblings, 1 reply; 2+ messages in thread
From: Guo Jie @ 2023-09-21  1:19 UTC (permalink / raw)
  To: gcc-patches; +Cc: xuchenghua, chenglulu, i, xry111, Guo Jie

gcc/ChangeLog:

	* config/loongarch/lasx.md (lasx_vecinit_merge_<LASX:mode>): New
	pattern for vector construction.
	(vec_set<mode>_internal): Ditto.
	(lasx_xvinsgr2vr_<mode256_i_half>_internal): Ditto.
	(lasx_xvilvl_<lasxfmt_f>_internal): Ditto.
	* config/loongarch/loongarch.cc (loongarch_expand_vector_init):
	Optimize the implementation of vector construction.
	(loongarch_expand_vector_init_same): New function.
	* config/loongarch/lsx.md (lsx_vilvl_<lsxfmt_f>_internal): New
	pattern for vector construction.
	(lsx_vreplvei_mirror_<lsxfmt_f>): New pattern for vector
	construction.
	(vec_concatv2df): Ditto.
	(vec_concatv4sf): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c: New test.
	* gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c: New test.
---
 gcc/config/loongarch/lasx.md                  |  69 ++
 gcc/config/loongarch/loongarch.cc             | 716 +++++++++---------
 gcc/config/loongarch/lsx.md                   | 134 ++++
 .../vector/lasx/lasx-vec-construct-opt.c      | 102 +++
 .../vector/lsx/lsx-vec-construct-opt.c        |  85 +++
 5 files changed, 732 insertions(+), 374 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 8111c8bb79a..2bc5d47ed4a 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -186,6 +186,9 @@ (define_c_enum "unspec" [
   UNSPEC_LASX_XVLDI
   UNSPEC_LASX_XVLDX
   UNSPEC_LASX_XVSTX
+  UNSPEC_LASX_VECINIT_MERGE
+  UNSPEC_LASX_VEC_SET_INTERNAL
+  UNSPEC_LASX_XVILVL_INTERNAL
 ])
 
 ;; All vector modes with 256 bits.
@@ -255,6 +258,15 @@ (define_mode_attr VFHMODE256
    [(V8SF "V4SF")
    (V4DF "V2DF")])
 
+;; The attribute gives half int/float modes for vector modes.
+(define_mode_attr VHMODE256_ALL
+  [(V32QI "V16QI")
+   (V16HI "V8HI")
+   (V8SI "V4SI")
+   (V4DI "V2DI")
+   (V8SF "V4SF")
+   (V4DF "V2DF")])
+
 ;; The attribute gives double modes for vector modes in LASX.
 (define_mode_attr VDMODE256
   [(V8SI "V4DI")
@@ -312,6 +324,11 @@ (define_mode_attr mode256_f
    (V4DI "v4df")
    (V8SI "v8sf")])
 
+;; This attribute gives V32QI mode and V16HI mode with half size.
+(define_mode_attr mode256_i_half
+  [(V32QI "v16qi")
+   (V16HI "v8hi")])
+
 ;; This attribute gives suffix for LASX instructions.
 (define_mode_attr lasxfmt
   [(V4DF "d")
@@ -756,6 +773,20 @@ (define_insn "lasx_xvpermi_q_<LASX:mode>"
   [(set_attr "type" "simd_splat")
    (set_attr "mode" "<MODE>")])
 
+;; Only for loongarch_expand_vector_init in loongarch.cc.
+;; Support a LSX-mode input op2.
+(define_insn "lasx_vecinit_merge_<LASX:mode>"
+  [(set (match_operand:LASX 0 "register_operand" "=f")
+	(unspec:LASX
+	  [(match_operand:LASX 1 "register_operand" "0")
+	   (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")
+	   (match_operand     3 "const_uimm8_operand")]
+	   UNSPEC_LASX_VECINIT_MERGE))]
+  "ISA_HAS_LASX"
+  "xvpermi.q\t%u0,%u2,%3"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "lasx_xvpickve2gr_d<u>"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(any_extend:DI
@@ -779,6 +810,33 @@ (define_expand "vec_set<mode>"
   DONE;
 })
 
+;; Only for loongarch_expand_vector_init in loongarch.cc.
+;; Simulate missing instructions xvinsgr2vr.b and xvinsgr2vr.h.
+(define_expand "vec_set<mode>_internal"
+  [(match_operand:ILASX_HB 0 "register_operand")
+   (match_operand:<UNITMODE> 1 "reg_or_0_operand")
+   (match_operand 2 "const_<indeximm256>_operand")]
+  "ISA_HAS_LASX"
+{
+  rtx index = GEN_INT (1 << INTVAL (operands[2]));
+  emit_insn (gen_lasx_xvinsgr2vr_<mode256_i_half>_internal
+	     (operands[0], operands[1], operands[0], index));
+  DONE;
+})
+
+(define_insn "lasx_xvinsgr2vr_<mode256_i_half>_internal"
+  [(set (match_operand:ILASX_HB 0 "register_operand" "=f")
+	(unspec:ILASX_HB [(match_operand:<UNITMODE> 1 "reg_or_0_operand" "rJ")
+			  (match_operand:ILASX_HB 2 "register_operand" "0")
+			  (match_operand 3 "const_<bitmask256>_operand" "")]
+			 UNSPEC_LASX_VEC_SET_INTERNAL))]
+  "ISA_HAS_LASX"
+{
+  return "vinsgr2vr.<lasxfmt>\t%w0,%z1,%y3";
+}
+  [(set_attr "type" "simd_insert")
+   (set_attr "mode" "<MODE>")])
+
 (define_expand "vec_set<mode>"
   [(match_operand:FLASX 0 "register_operand")
    (match_operand:<UNITMODE> 1 "reg_or_0_operand")
@@ -1567,6 +1625,17 @@ (define_insn "logb<mode>2"
   [(set_attr "type" "simd_flog2")
    (set_attr "mode" "<MODE>")])
 
+;; Only for loongarch_expand_vector_init in loongarch.cc.
+;; Merge two scalar floating-point op1 and op2 into a LASX op0.
+(define_insn "lasx_xvilvl_<lasxfmt_f>_internal"
+  [(set (match_operand:FLASX 0 "register_operand" "=f")
+	(unspec:FLASX [(match_operand:<UNITMODE> 1 "register_operand" "f")
+		       (match_operand:<UNITMODE> 2 "register_operand" "f")]
+		      UNSPEC_LASX_XVILVL_INTERNAL))]
+  "ISA_HAS_LASX"
+  "xvilvl.<lasxfmt>\t%u0,%u2,%u1"
+  [(set_attr "type" "simd_permute")
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "smax<mode>3"
   [(set (match_operand:FLASX 0 "register_operand" "=f")
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 845fad5a8e8..9e1b0d0cfa8 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -10199,300 +10199,344 @@ loongarch_expand_vector_group_init (rtx target, rtx vals)
 						      ops[1])));
 }
 
+/* Expand initialization of a vector which has all same elements.  */
+
 void
-loongarch_expand_vector_init (rtx target, rtx vals)
+loongarch_expand_vector_init_same (rtx target, rtx vals, unsigned nvar)
 {
   machine_mode vmode = GET_MODE (target);
   machine_mode imode = GET_MODE_INNER (vmode);
-  unsigned i, nelt = GET_MODE_NUNITS (vmode);
-  unsigned nvar = 0;
-  bool all_same = true;
-  rtx x;
+  rtx same = XVECEXP (vals, 0, 0);
+  rtx temp, temp2;
 
-  for (i = 0; i < nelt; ++i)
+  if (CONST_INT_P (same) && nvar == 0
+      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
+    {
+      switch (vmode)
+	{
+	case E_V32QImode:
+	case E_V16HImode:
+	case E_V8SImode:
+	case E_V4DImode:
+	case E_V16QImode:
+	case E_V8HImode:
+	case E_V4SImode:
+	case E_V2DImode:
+	  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
+	  emit_move_insn (target, temp);
+	  return;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  temp = gen_reg_rtx (imode);
+  if (imode == GET_MODE (same))
+    temp2 = same;
+  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
     {
-      x = XVECEXP (vals, 0, i);
-      if (!loongarch_constant_elt_p (x))
-	nvar++;
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-	all_same = false;
+      if (GET_CODE (same) == MEM)
+	{
+	  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
+	  loongarch_emit_move (reg_tmp, same);
+	  temp2 = simplify_gen_subreg (imode, reg_tmp, GET_MODE (reg_tmp), 0);
+	}
+      else
+	temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), 0);
     }
-
-  if (ISA_HAS_LASX && GET_MODE_SIZE (vmode) == 32)
+  else
     {
-      if (all_same)
+      if (GET_CODE (same) == MEM)
 	{
-	  rtx same = XVECEXP (vals, 0, 0);
-	  rtx temp, temp2;
+	  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
+	  loongarch_emit_move (reg_tmp, same);
+	  temp2 = lowpart_subreg (imode, reg_tmp, GET_MODE (reg_tmp));
+	}
+      else
+	temp2 = lowpart_subreg (imode, same, GET_MODE (same));
+    }
+  emit_move_insn (temp, temp2);
 
-	  if (CONST_INT_P (same) && nvar == 0
-	      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
-	    {
-	      switch (vmode)
-		{
-		case E_V32QImode:
-		case E_V16HImode:
-		case E_V8SImode:
-		case E_V4DImode:
-		  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
-		  emit_move_insn (target, temp);
-		  return;
+  switch (vmode)
+    {
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SImode:
+    case E_V4DImode:
+    case E_V16QImode:
+    case E_V8HImode:
+    case E_V4SImode:
+    case E_V2DImode:
+      loongarch_emit_move (target, gen_rtx_VEC_DUPLICATE (vmode, temp));
+      break;
 
-		default:
-		  gcc_unreachable ();
-		}
-	    }
+    case E_V8SFmode:
+      emit_insn (gen_lasx_xvreplve0_w_f_scalar (target, temp));
+      break;
 
-	  temp = gen_reg_rtx (imode);
-	  if (imode == GET_MODE (same))
-	    temp2 = same;
-	  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
-	    {
-	      if (GET_CODE (same) == MEM)
-		{
-		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
-		  loongarch_emit_move (reg_tmp, same);
-		  temp2 = simplify_gen_subreg (imode, reg_tmp,
-					       GET_MODE (reg_tmp), 0);
-		}
-	      else
-		temp2 = simplify_gen_subreg (imode, same,
-					     GET_MODE (same), 0);
-	    }
-	  else
-	    {
-	      if (GET_CODE (same) == MEM)
-		{
-		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
-		  loongarch_emit_move (reg_tmp, same);
-		  temp2 = lowpart_subreg (imode, reg_tmp,
-					  GET_MODE (reg_tmp));
-		}
-	      else
-		temp2 = lowpart_subreg (imode, same, GET_MODE (same));
-	    }
-	  emit_move_insn (temp, temp2);
+    case E_V4DFmode:
+      emit_insn (gen_lasx_xvreplve0_d_f_scalar (target, temp));
+      break;
 
-	  switch (vmode)
-	    {
-	    case E_V32QImode:
-	    case E_V16HImode:
-	    case E_V8SImode:
-	    case E_V4DImode:
-	      loongarch_emit_move (target,
-				   gen_rtx_VEC_DUPLICATE (vmode, temp));
-	      break;
+    case E_V4SFmode:
+      emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
+      break;
 
-	    case E_V8SFmode:
-	      emit_insn (gen_lasx_xvreplve0_w_f_scalar (target, temp));
-	      break;
+    case E_V2DFmode:
+      emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
+      break;
 
-	    case E_V4DFmode:
-	      emit_insn (gen_lasx_xvreplve0_d_f_scalar (target, temp));
-	      break;
+    default:
+      gcc_unreachable ();
+    }
+}
 
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-      else
-	{
-	  rtvec vec = shallow_copy_rtvec (XVEC (vals, 0));
+/* Expand a vector initialization.  */
 
-	  for (i = 0; i < nelt; ++i)
-	    RTVEC_ELT (vec, i) = CONST0_RTX (imode);
+void
+loongarch_expand_vector_init (rtx target, rtx vals)
+{
+  machine_mode vmode = GET_MODE (target);
+  machine_mode imode = GET_MODE_INNER (vmode);
+  unsigned i, nelt = GET_MODE_NUNITS (vmode);
+  /* VALS is divided into high and low half-part.  */
+  /* Number of non constant elements in corresponding parts of VALS.  */
+  unsigned nvar = 0, hi_nvar = 0, lo_nvar = 0;
+  /* all_same : true if all elements of VALS are the same.
+     hi_same : true if all elements of the high half-part are the same.
+     lo_same : true if all elements of the low half-part are the same.
+     half_same : true if the high half-part is the same as the low one.  */
+  bool all_same = false, hi_same = true, lo_same = true, half_same = true;
+  rtx val[32], val_hi[32], val_lo[16];
+  rtx x, op0, op1;
+  /* Copy one element of vals to per element of target vector.  */
+  typedef rtx (*loongarch_vec_repl1_fn) (rtx, rtx);
+  /* Copy two elements of vals to target vector.  */
+  typedef rtx (*loongarch_vec_repl2_fn) (rtx, rtx, rtx);
+  /* Insert scalar operands into the specified position of the vector.  */
+  typedef rtx (*loongarch_vec_set_fn) (rtx, rtx, rtx);
+  /* Copy 64bit lowpart to highpart.  */
+  typedef rtx (*loongarch_vec_mirror_fn) (rtx, rtx, rtx);
+  /* Merge lowpart and highpart into target.  */
+  typedef rtx (*loongarch_vec_merge_fn) (rtx, rtx, rtx, rtx);
+
+  loongarch_vec_repl1_fn loongarch_vec_repl1_128 = NULL,
+			 loongarch_vec_repl1_256 = NULL;
+  loongarch_vec_repl2_fn loongarch_vec_repl2_128 = NULL,
+			 loongarch_vec_repl2_256 = NULL;
+  loongarch_vec_set_fn loongarch_vec_set128 = NULL, loongarch_vec_set256 = NULL;
+  loongarch_vec_mirror_fn loongarch_vec_mirror = NULL;
+  loongarch_vec_merge_fn loongarch_lasx_vecinit_merge = NULL;
+  machine_mode half_mode = VOIDmode;
+
+  /* Check whether elements of each part are the same.  */
+  for (i = 0; i < nelt / 2; ++i)
+    {
+      val_hi[i] = val_hi[i + nelt / 2] = val[i + nelt / 2]
+	= XVECEXP (vals, 0, i + nelt / 2);
+      val_lo[i] = val[i] = XVECEXP (vals, 0, i);
+      if (!loongarch_constant_elt_p (val_hi[i]))
+	hi_nvar++;
+      if (!loongarch_constant_elt_p (val_lo[i]))
+	lo_nvar++;
+      if (i > 0 && !rtx_equal_p (val_hi[i], val_hi[0]))
+	hi_same = false;
+      if (i > 0 && !rtx_equal_p (val_lo[i], val_lo[0]))
+	lo_same = false;
+      if (!rtx_equal_p (val_hi[i], val_lo[i]))
+	half_same = false;
+    }
+
+  /* If all elements are the same, set all_same true.  */
+  if (hi_same && lo_same && half_same)
+    all_same = true;
+
+  nvar = hi_nvar + lo_nvar;
 
-	  emit_move_insn (target, gen_rtx_CONST_VECTOR (vmode, vec));
+  switch (vmode)
+    {
+    case E_V32QImode:
+      half_mode = E_V16QImode;
+      loongarch_vec_set256 = gen_vec_setv32qi_internal;
+      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_b;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v32qi : gen_lasx_vecinit_merge_v32qi;
+      /* FALLTHRU.  */
+    case E_V16QImode:
+      loongarch_vec_set128 = gen_vec_setv16qi;
+      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_b;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_b;
+      break;
 
-	  machine_mode half_mode = VOIDmode;
-	  rtx target_hi, target_lo;
+    case E_V16HImode:
+      half_mode = E_V8HImode;
+      loongarch_vec_set256 = gen_vec_setv16hi_internal;
+      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_h;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v16hi : gen_lasx_vecinit_merge_v16hi;
+      /* FALLTHRU.  */
+    case E_V8HImode:
+      loongarch_vec_set128 = gen_vec_setv8hi;
+      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_h;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_h;
+      break;
 
-	  switch (vmode)
-	    {
-	    case E_V32QImode:
-	      half_mode=E_V16QImode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
-		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
-		  if (i == 0)
-		    {
-		      emit_insn (gen_lsx_vreplvei_b_scalar (target_hi,
-							    temp_hi));
-		      emit_insn (gen_lsx_vreplvei_b_scalar (target_lo,
-							    temp_lo));
-		    }
-		  else
-		    {
-		      emit_insn (gen_vec_setv16qi (target_hi, temp_hi,
-						   GEN_INT (i)));
-		      emit_insn (gen_vec_setv16qi (target_lo, temp_lo,
-						   GEN_INT (i)));
-		    }
-		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
+    case E_V8SImode:
+      half_mode = V4SImode;
+      loongarch_vec_set256 = gen_vec_setv8si;
+      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_w;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v8si : gen_lasx_vecinit_merge_v8si;
+      /* FALLTHRU.  */
+    case E_V4SImode:
+      loongarch_vec_set128 = gen_vec_setv4si;
+      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_w;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_w;
+      break;
 
-	    case E_V16HImode:
-	      half_mode=E_V8HImode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
-		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
-		  if (i == 0)
-		    {
-		      emit_insn (gen_lsx_vreplvei_h_scalar (target_hi,
-							    temp_hi));
-		      emit_insn (gen_lsx_vreplvei_h_scalar (target_lo,
-							    temp_lo));
-		    }
-		  else
-		    {
-		      emit_insn (gen_vec_setv8hi (target_hi, temp_hi,
-						  GEN_INT (i)));
-		      emit_insn (gen_vec_setv8hi (target_lo, temp_lo,
-						  GEN_INT (i)));
-		    }
-		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
+    case E_V4DImode:
+      half_mode = E_V2DImode;
+      loongarch_vec_set256 = gen_vec_setv4di;
+      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_d;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v4di : gen_lasx_vecinit_merge_v4di;
+      /* FALLTHRU.  */
+    case E_V2DImode:
+      loongarch_vec_set128 = gen_vec_setv2di;
+      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_d;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_d;
+      break;
 
-	    case E_V8SImode:
-	      half_mode=V4SImode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
-		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
-		  if (i == 0)
-		    {
-		      emit_insn (gen_lsx_vreplvei_w_scalar (target_hi,
-							    temp_hi));
-		      emit_insn (gen_lsx_vreplvei_w_scalar (target_lo,
-							    temp_lo));
-		    }
-		  else
-		    {
-		      emit_insn (gen_vec_setv4si (target_hi, temp_hi,
-						  GEN_INT (i)));
-		      emit_insn (gen_vec_setv4si (target_lo, temp_lo,
-						  GEN_INT (i)));
-		    }
-		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
+    case E_V8SFmode:
+      half_mode = E_V4SFmode;
+      loongarch_vec_set256 = gen_vec_setv8sf;
+      loongarch_vec_repl1_128 = gen_lsx_vreplvei_w_f_scalar;
+      loongarch_vec_repl2_256 = gen_lasx_xvilvl_w_f_internal;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v8sf : gen_lasx_vecinit_merge_v8sf;
+      /* FALLTHRU.  */
+    case E_V4SFmode:
+      loongarch_vec_set128 = gen_vec_setv4sf;
+      loongarch_vec_repl2_128 = gen_lsx_vilvl_w_f_internal;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_w_f;
+      break;
 
-	    case E_V4DImode:
-	      half_mode=E_V2DImode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
-		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
-		  if (i == 0)
-		    {
-		      emit_insn (gen_lsx_vreplvei_d_scalar (target_hi,
-							    temp_hi));
-		      emit_insn (gen_lsx_vreplvei_d_scalar (target_lo,
-							    temp_lo));
-		    }
-		  else
-		    {
-		      emit_insn (gen_vec_setv2di (target_hi, temp_hi,
-						  GEN_INT (i)));
-		      emit_insn (gen_vec_setv2di (target_lo, temp_lo,
-						  GEN_INT (i)));
-		    }
-		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
+    case E_V4DFmode:
+      half_mode = E_V2DFmode;
+      loongarch_vec_set256 = gen_vec_setv4df;
+      loongarch_vec_repl1_128 = gen_lsx_vreplvei_d_f_scalar;
+      loongarch_vec_repl2_256 = gen_lasx_xvilvl_d_f_internal;
+      loongarch_lasx_vecinit_merge
+	= half_same ? gen_lasx_xvpermi_q_v4df : gen_lasx_vecinit_merge_v4df;
+      /* FALLTHRU.  */
+    case E_V2DFmode:
+      loongarch_vec_set128 = gen_vec_setv2df;
+      loongarch_vec_repl2_128 = gen_lsx_vilvl_d_f_internal;
+      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_d_f;
+      break;
 
-	    case E_V8SFmode:
-	      half_mode=E_V4SFmode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
+    default:
+      gcc_unreachable ();
+    }
+
+  if (ISA_HAS_LASX && GET_MODE_SIZE (vmode) == 32)
+    {
+      /* If all elements are the same, just do a broadcast.  */
+      if (all_same)
+	loongarch_expand_vector_init_same (target, vals, nvar);
+      else
+	{
+	  gcc_assert (nelt >= 4);
+
+	  rtx target_hi, target_lo;
+	  /* Write elements of high half-part in target directly.  */
+	  target_hi = target;
+	  target_lo = gen_reg_rtx (half_mode);
+
+	  /* If all elements of high half-part are the same,
+	     just do a broadcast.  Also applicable to low half-part.  */
+	  if (hi_same)
+	    {
+	      rtx vtmp = gen_rtx_PARALLEL (vmode, gen_rtvec_v (nelt, val_hi));
+	      loongarch_expand_vector_init_same (target_hi, vtmp, hi_nvar);
+	    }
+	  if (lo_same)
+	    {
+	      rtx vtmp
+		= gen_rtx_PARALLEL (half_mode, gen_rtvec_v (nelt / 2, val_lo));
+	      loongarch_expand_vector_init_same (target_lo, vtmp, lo_nvar);
+	    }
+
+	  for (i = 0; i < nelt / 2; ++i)
+	    {
+	      if (!hi_same)
 		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
-		  if (i == 0)
+		  if (vmode == E_V8SFmode || vmode == E_V4DFmode)
 		    {
-		      emit_insn (gen_lsx_vreplvei_w_f_scalar (target_hi,
-							      temp_hi));
-		      emit_insn (gen_lsx_vreplvei_w_f_scalar (target_lo,
-							      temp_lo));
+		      /* Using xvilvl to load lowest 2 elements simultaneously
+			 to reduce the number of instructions.  */
+		      if (i == 1)
+			{
+			  op0 = gen_reg_rtx (imode);
+			  emit_move_insn (op0, val_hi[0]);
+			  op1 = gen_reg_rtx (imode);
+			  emit_move_insn (op1, val_hi[1]);
+			  emit_insn (
+			    loongarch_vec_repl2_256 (target_hi, op0, op1));
+			}
+		      else if (i > 1)
+			{
+			  op0 = gen_reg_rtx (imode);
+			  emit_move_insn (op0, val_hi[i]);
+			  emit_insn (
+			    loongarch_vec_set256 (target_hi, op0, GEN_INT (i)));
+			}
 		    }
 		  else
 		    {
-		      emit_insn (gen_vec_setv4sf (target_hi, temp_hi,
-						  GEN_INT (i)));
-		      emit_insn (gen_vec_setv4sf (target_lo, temp_lo,
-						  GEN_INT (i)));
+		      /* Assign the lowest element of val_hi to all elements
+			 of target_hi.  */
+		      if (i == 0)
+			{
+			  op0 = gen_reg_rtx (imode);
+			  emit_move_insn (op0, val_hi[0]);
+			  emit_insn (loongarch_vec_repl1_256 (target_hi, op0));
+			}
+		      else if (!rtx_equal_p (val_hi[i], val_hi[0]))
+			{
+			  op0 = gen_reg_rtx (imode);
+			  emit_move_insn (op0, val_hi[i]);
+			  emit_insn (
+			    loongarch_vec_set256 (target_hi, op0, GEN_INT (i)));
+			}
 		    }
 		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
-
-	    case E_V4DFmode:
-	      half_mode=E_V2DFmode;
-	      target_hi = gen_reg_rtx (half_mode);
-	      target_lo = gen_reg_rtx (half_mode);
-	      for (i = 0; i < nelt/2; ++i)
+	      if (!lo_same && !half_same)
 		{
-		  rtx temp_hi = gen_reg_rtx (imode);
-		  rtx temp_lo = gen_reg_rtx (imode);
-		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
-		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
+		  /* Assign the lowest element of val_lo to all elements
+		     of target_lo.  */
 		  if (i == 0)
 		    {
-		      emit_insn (gen_lsx_vreplvei_d_f_scalar (target_hi,
-							      temp_hi));
-		      emit_insn (gen_lsx_vreplvei_d_f_scalar (target_lo,
-							      temp_lo));
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val_lo[0]);
+		      emit_insn (loongarch_vec_repl1_128 (target_lo, op0));
 		    }
-		  else
+		  else if (!rtx_equal_p (val_lo[i], val_lo[0]))
 		    {
-		      emit_insn (gen_vec_setv2df (target_hi, temp_hi,
-						  GEN_INT (i)));
-		      emit_insn (gen_vec_setv2df (target_lo, temp_lo,
-						  GEN_INT (i)));
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val_lo[i]);
+		      emit_insn (
+			loongarch_vec_set128 (target_lo, op0, GEN_INT (i)));
 		    }
 		}
-	      emit_insn (gen_rtx_SET (target,
-				      gen_rtx_VEC_CONCAT (vmode, target_hi,
-							  target_lo)));
-	      break;
-
-	    default:
-	      gcc_unreachable ();
 	    }
-
+	  if (half_same)
+	    {
+	      emit_insn (loongarch_lasx_vecinit_merge (target, target_hi,
+						       target_hi, const0_rtx));
+	      return;
+	    }
+	  emit_insn (loongarch_lasx_vecinit_merge (target, target_hi, target_lo,
+						   GEN_INT (0x20)));
 	}
       return;
     }
@@ -10500,130 +10544,54 @@ loongarch_expand_vector_init (rtx target, rtx vals)
   if (ISA_HAS_LSX)
     {
       if (all_same)
+	loongarch_expand_vector_init_same (target, vals, nvar);
+      else
 	{
-	  rtx same = XVECEXP (vals, 0, 0);
-	  rtx temp, temp2;
-
-	  if (CONST_INT_P (same) && nvar == 0
-	      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
-	    {
-	      switch (vmode)
-		{
-		case E_V16QImode:
-		case E_V8HImode:
-		case E_V4SImode:
-		case E_V2DImode:
-		  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
-		  emit_move_insn (target, temp);
-		  return;
-
-		default:
-		  gcc_unreachable ();
-		}
-	    }
-	  temp = gen_reg_rtx (imode);
-	  if (imode == GET_MODE (same))
-	    temp2 = same;
-	  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
-	    {
-	      if (GET_CODE (same) == MEM)
-		{
-		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
-		  loongarch_emit_move (reg_tmp, same);
-		  temp2 = simplify_gen_subreg (imode, reg_tmp,
-					       GET_MODE (reg_tmp), 0);
-		}
-	      else
-		temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), 0);
-	    }
-	  else
+	  for (i = 0; i < nelt; ++i)
 	    {
-	      if (GET_CODE (same) == MEM)
+	      if (vmode == E_V4SFmode || vmode == E_V2DFmode)
 		{
-		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
-		  loongarch_emit_move (reg_tmp, same);
-		  temp2 = lowpart_subreg (imode, reg_tmp, GET_MODE (reg_tmp));
+		  /* Using vilvl to load lowest 2 elements simultaneously to
+		     reduce the number of instructions.  */
+		  if (i == 1)
+		    {
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val[0]);
+		      op1 = gen_reg_rtx (imode);
+		      emit_move_insn (op1, val[1]);
+		      emit_insn (loongarch_vec_repl2_128 (target, op0, op1));
+		    }
+		  else if (i > 1)
+		    {
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val[i]);
+		      emit_insn (
+			loongarch_vec_set128 (target, op0, GEN_INT (i)));
+		    }
 		}
 	      else
-		temp2 = lowpart_subreg (imode, same, GET_MODE (same));
-	    }
-	  emit_move_insn (temp, temp2);
-
-	  switch (vmode)
-	    {
-	    case E_V16QImode:
-	    case E_V8HImode:
-	    case E_V4SImode:
-	    case E_V2DImode:
-	      loongarch_emit_move (target, gen_rtx_VEC_DUPLICATE (vmode, temp));
-	      break;
-
-	    case E_V4SFmode:
-	      emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
-	      break;
-
-	    case E_V2DFmode:
-	      emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
-	      break;
-
-	    default:
-	      gcc_unreachable ();
-	    }
-	}
-      else
-	{
-	  emit_move_insn (target, CONST0_RTX (vmode));
-
-	  for (i = 0; i < nelt; ++i)
-	    {
-	      rtx temp = gen_reg_rtx (imode);
-	      emit_move_insn (temp, XVECEXP (vals, 0, i));
-	      switch (vmode)
 		{
-		case E_V16QImode:
-		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_b_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv16qi (target, temp, GEN_INT (i)));
-		  break;
-
-		case E_V8HImode:
-		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_h_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv8hi (target, temp, GEN_INT (i)));
-		  break;
-
-		case E_V4SImode:
-		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_w_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv4si (target, temp, GEN_INT (i)));
-		  break;
-
-		case E_V2DImode:
-		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_d_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv2di (target, temp, GEN_INT (i)));
-		  break;
-
-		case E_V4SFmode:
-		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv4sf (target, temp, GEN_INT (i)));
-		  break;
-
-		case E_V2DFmode:
+		  if (half_same && i == nelt / 2)
+		    {
+		      emit_insn (
+			loongarch_vec_mirror (target, target, const0_rtx));
+		      return;
+		    }
+		  /* Assign the lowest element of val to all elements of
+		     target.  */
 		  if (i == 0)
-		    emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
-		  else
-		    emit_insn (gen_vec_setv2df (target, temp, GEN_INT (i)));
-		  break;
-
-		default:
-		  gcc_unreachable ();
+		    {
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val[0]);
+		      emit_insn (loongarch_vec_repl1_128 (target, op0));
+		    }
+		  else if (!rtx_equal_p (val[i], val[0]))
+		    {
+		      op0 = gen_reg_rtx (imode);
+		      emit_move_insn (op0, val[i]);
+		      emit_insn (
+			loongarch_vec_set128 (target, op0, GEN_INT (i)));
+		    }
 		}
 	    }
 	}
@@ -10640,8 +10608,8 @@ loongarch_expand_vector_init (rtx target, rtx vals)
   /* For two-part initialization, always use CONCAT.  */
   if (nelt == 2)
     {
-      rtx op0 = force_reg (imode, XVECEXP (vals, 0, 0));
-      rtx op1 = force_reg (imode, XVECEXP (vals, 0, 1));
+      rtx op0 = force_reg (imode, val[0]);
+      rtx op1 = force_reg (imode, val[1]);
       x = gen_rtx_VEC_CONCAT (vmode, op0, op1);
       emit_insn (gen_rtx_SET (target, x));
       return;
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index fb4d228ba84..075f6ba569d 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -176,6 +176,8 @@ (define_c_enum "unspec" [
   UNSPEC_LSX_VSSRARNI
   UNSPEC_LSX_VSSRARNI2
   UNSPEC_LSX_VPERMI
+  UNSPEC_LSX_VILVL_INTERNAL
+  UNSPEC_LSX_VREPLVEI_MIRROR
 ])
 
 ;; This attribute gives suffix for integers in VHMODE.
@@ -1551,6 +1553,18 @@ (define_insn "logb<mode>2"
   [(set_attr "type" "simd_flog2")
    (set_attr "mode" "<MODE>")])
 
+;; Only for loongarch_expand_vector_init in loongarch.cc.
+;; Merge two scalar floating-point op1 and op2 into a LSX op0.
+(define_insn "lsx_vilvl_<lsxfmt_f>_internal"
+  [(set (match_operand:FLSX 0 "register_operand" "=f")
+	(unspec:FLSX [(match_operand:<UNITMODE> 1 "register_operand" "f")
+		      (match_operand:<UNITMODE> 2 "register_operand" "f")]
+		     UNSPEC_LSX_VILVL_INTERNAL))]
+  "ISA_HAS_LSX"
+  "vilvl.<lsxfmt>\t%w0,%w2,%w1"
+  [(set_attr "type" "simd_permute")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "smax<mode>3"
   [(set (match_operand:FLSX 0 "register_operand" "=f")
 	(smax:FLSX (match_operand:FLSX 1 "register_operand" "f")
@@ -2289,6 +2303,16 @@ (define_insn "lsx_vreplve_<lsxfmt_f>"
   [(set_attr "type" "simd_splat")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "lsx_vreplvei_mirror_<lsxfmt_f>"
+  [(set (match_operand:LSX 0 "register_operand" "=f")
+	(unspec: LSX [(match_operand:LSX 1 "register_operand" "f")
+				(match_operand 2 "const_<indeximm>_operand" "")]
+				UNSPEC_LSX_VREPLVEI_MIRROR))]
+  "ISA_HAS_LSX"
+  "vreplvei.d\t%w0,%w1,%2"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "lsx_vreplvei_<lsxfmt_f>"
   [(set (match_operand:LSX 0 "register_operand" "=f")
 	(vec_duplicate:LSX
@@ -2450,6 +2474,99 @@ (define_expand "vec_concatv2di"
   DONE;
 })
 
+;; Implement vec_concatv2df by vilvl.d.
+(define_insn_and_split "vec_concatv2df"
+  [(set (match_operand:V2DF 0 "register_operand" "=f")
+	(vec_concat:V2DF
+	  (match_operand:DF 1 "register_operand" "f")
+	  (match_operand:DF 2 "register_operand" "f")))]
+  "ISA_HAS_LSX"
+  ""
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  emit_insn (gen_lsx_vilvl_d_f (operands[0],
+				gen_rtx_REG (V2DFmode, REGNO (operands[1])),
+				gen_rtx_REG (V2DFmode, REGNO (operands[2]))));
+  DONE;
+}
+  [(set_attr "mode" "V2DF")])
+
+;; Implement vec_concatv4sf.
+;; Optimize based on hardware register allocation of operands.
+(define_insn_and_split "vec_concatv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=f")
+	(vec_concat:V4SF
+	  (vec_concat:V2SF
+	    (match_operand:SF 1 "register_operand" "f")
+	    (match_operand:SF 2 "register_operand" "f"))
+	  (vec_concat:V2SF
+	    (match_operand:SF 3 "register_operand" "f")
+	    (match_operand:SF 4 "register_operand" "f"))))]
+  "ISA_HAS_LSX"
+  ""
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[5] = GEN_INT (1);
+  operands[6] = GEN_INT (2);
+  operands[7] = GEN_INT (4);
+  operands[8] = GEN_INT (8);
+
+  /* If all input are same, use vreplvei.w to broadcast.  */
+  if (REGNO (operands[1]) == REGNO (operands[2])
+      && REGNO (operands[1]) == REGNO (operands[3])
+      && REGNO (operands[1]) == REGNO (operands[4]))
+    {
+      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[1]));
+    }
+  /* If op0 is equal to op3, use vreplvei.w to set each element of op0 to op3.
+     For any other input that differs from op3, use vextrins.w to insert it.  */
+  else if (REGNO (operands[0]) == REGNO (operands[3]))
+    {
+      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[3]));
+      if (REGNO (operands[1]) != REGNO (operands[3]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[1],
+						operands[0], operands[5]));
+      if (REGNO (operands[2]) != REGNO (operands[3]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[2],
+						operands[0], operands[6]));
+      if (REGNO (operands[4]) != REGNO (operands[3]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[4],
+						operands[0], operands[8]));
+    }
+  /* If op0 is equal to op4, use vreplvei.w to set each element of op0 to op4.
+     For any other input that differs from op4, use vextrins.w to insert it.  */
+  else if (REGNO (operands[0]) == REGNO (operands[4]))
+    {
+      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[4]));
+      if (REGNO (operands[1]) != REGNO (operands[4]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[1],
+						operands[0], operands[5]));
+      if (REGNO (operands[2]) != REGNO (operands[4]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[2],
+						operands[0], operands[6]));
+      if (REGNO (operands[3]) != REGNO (operands[4]))
+	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[3],
+						operands[0], operands[7]));
+    }
+  /* Otherwise, use vilvl.w to merge op1 and op2 first,
+     then use vextrins.w to insert op3 into element 3
+     and op4 into element 0 of the high part.  */
+  else
+    {
+      emit_insn (
+	gen_lsx_vilvl_w_f (operands[0],
+			   gen_rtx_REG (V4SFmode, REGNO (operands[1])),
+			   gen_rtx_REG (V4SFmode, REGNO (operands[2]))));
+      emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[3],
+					      operands[0], operands[7]));
+      emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[4],
+					      operands[0], operands[8]));
+    }
+  DONE;
+}
+  [(set_attr "mode" "V4SF")])
 
 (define_insn "vandn<mode>3"
   [(set (match_operand:LSX 0 "register_operand" "=f")
@@ -4465,3 +4582,20 @@ (define_insn "lsx_vpermi_w"
   "vpermi.w\t%w0,%w2,%3"
   [(set_attr "type" "simd_bit")
    (set_attr "mode" "V4SI")])
+
+;; Delete one of two instructions that exactly play the same role.
+(define_peephole2
+  [(set (match_operand:V2DI 0 "register_operand")
+	(vec_duplicate:V2DI (match_operand:DI 1 "register_operand")))
+   (set (match_operand:V2DI 2 "register_operand")
+	(vec_merge:V2DI
+	  (vec_duplicate:V2DI (match_operand:DI 3 "register_operand"))
+	  (match_operand:V2DI 4 "register_operand")
+	  (match_operand 5 "const_int_operand")))]
+  "operands[0] == operands[2] &&
+   operands[1] == operands[3] &&
+   operands[2] == operands[4] &&
+   INTVAL (operands[5]) == 2"
+  [(set (match_dup 0)
+	(vec_duplicate:V2DI (match_dup 1)))]
+  "")
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
new file mode 100644
index 00000000000..487816a483f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-mlasx -O3" } */
+
+#include <lasxintrin.h>
+
+extern long long *x_di;
+extern int *x_si;
+extern short int *x_hi;
+extern char *x_qi;
+extern double *y_df;
+extern float *y_sf;
+
+/* Remove some unnecessary vinsgr2vr.d as the corresponding elements
+   have already been set.  */
+/* { dg-final { scan-assembler-not "v4i64:.*\tvinsgr2vr\\.d.*v4i64" } } */
+/* { dg-final { scan-assembler-times "v4i64:.*\txvldrepl\\.d.*v4i64" 1 } } */
+v4i64
+vec_construct_v4i64 ()
+{
+  v4i64 res =
+  { x_di[0], x_di[0], x_di[1], x_di[1] }
+  ;
+  return res;
+}
+
+/* Remove some unnecessary vinsgr2vr.w as the corresponding elements
+   have already been set.  */
+/* { dg-final { scan-assembler-not "v8i32:.*\tvinsgr2vr\\.w.*v8i32" } } */
+/* { dg-final { scan-assembler-times "v8i32:.*\txvreplgr2vr\\.w.*v8i32" 1 } } */
+v8i32
+vec_construct_v8i32 ()
+{
+  v8i32 res =
+  { x_si[0], x_si[0], x_si[0], x_si[0], 
+    x_si[0], x_si[2], x_si[0], x_si[0] }
+  ;
+  return res;
+}
+
+/* Remove some unnecessary vinsgr2vr.h as the corresponding elements
+   have already been set.  */
+/* { dg-final { scan-assembler-not "v16i16:.*\tvori\\.b.*v16i16" } } */
+/* { dg-final { scan-assembler-times "v16i16:.*\txvreplgr2vr\\.h.*v16i1" 1 } } */
+v16i16
+vec_construct_v16i16 ()
+{
+  v16i16 res =
+  { x_hi[1], x_hi[2], x_hi[1], x_hi[1], 
+    x_hi[1], x_hi[1], x_hi[1], x_hi[1],
+    x_hi[1], x_hi[1], x_hi[1], x_hi[1], 
+    x_hi[1], x_hi[1], x_hi[1], x_hi[2] }
+  ;
+  return res;
+}
+
+/* Remove some unnecessary vinsgr2vr.b as the corresponding elements
+   have already been set.  */
+/* { dg-final { scan-assembler-not "v32i8:.*\tvori\\.b.*v32i8" } } */
+/* { dg-final { scan-assembler-times "v32i8:.*\txvreplgr2vr\\.b.*v32i8" 1 } } */
+v32i8
+vec_construct_v32i8 ()
+{
+  v32i8 res =
+  { x_qi[0], x_qi[0], x_qi[0], x_qi[0], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
+    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
+    x_qi[0], x_qi[0], x_qi[0], x_qi[2],
+    x_qi[0], x_qi[0], x_qi[0], x_qi[0], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[0], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[0], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[3] }
+  ;
+  return res;
+}
+
+/* Set 2 elements of the vector simultaneously with vilvl.d,
+   avoiding extra vextrins.d instructions.  */
+/* { dg-final { scan-assembler-not "v4f64:.*\tvori\\.b.*v4f64" } } */
+/* { dg-final { scan-assembler-not "v4f64:.*\tvextrins\\.d.*v4f64" } } */
+/* { dg-final { scan-assembler-times "v4f64:.*\tvilvl\\.d.*v4f64" 1 } } */
+v4f64
+vec_construct_v4f64 ()
+{
+  v4f64 res =
+  { y_df[0], y_df[2], y_df[0], y_df[0]} 
+  ;
+  return res;
+}
+
+/* Set 2 elements of the vector simultaneously with vilvl.w,
+   avoiding extra vextrins.w instructions.  */
+/* { dg-final { scan-assembler-not "v8f32:.*\tvextrins\\.w.*v8f32" } } */
+/* { dg-final { scan-assembler-times "v8f32:.*\txvilvl\\.w.*v8f32" 1 } } */
+v8f32
+vec_construct_v8f32 ()
+{
+  v8f32 res =
+  { y_sf[2], y_sf[1], y_sf[2], y_sf[3], 
+    y_sf[2], y_sf[1], y_sf[2], y_sf[3] }
+  ;
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
new file mode 100644
index 00000000000..92da1c8af9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-mlsx -O3" } */
+
+#include <lsxintrin.h>
+
+extern long long *x_di;
+extern int *x_si;
+extern short int *x_hi;
+extern char *x_qi;
+extern double *y_df;
+extern float *y_sf;
+
+/* No change for V2DI mode.  */
+v2i64
+vec_construct_v2i64 ()
+{
+  v2i64 res =
+  { x_di[1], x_di[0]}
+  ;
+  return res;
+}
+
+/* Only load the lowest 2 elements and directly copy them to the high half-part,
+   avoiding extra vinsgr2vr.w instructions.  */
+/* { dg-final { scan-assembler-times "v4i32:.*\tvreplvei\\.d.*v4i32" 1 } } */
+v4i32
+vec_construct_v4i32 ()
+{
+  v4i32 res =
+  { x_si[0], x_si[1], x_si[0], x_si[1]} 
+  ;
+  return res;
+}
+
+/* Only load the lowest 4 elements and directly copy them to the high half-part,
+   avoiding extra vinsgr2vr.h instructions.  */
+/* { dg-final { scan-assembler-times "v8i16:.*\tvreplvei\\.d.*v8i16" 1 } } */
+v8i16
+vec_construct_v8i16 ()
+{
+  v8i16 res =
+  { x_hi[0], x_hi[0], x_hi[0], x_hi[1], 
+    x_hi[0], x_hi[0], x_hi[0], x_hi[1] }
+  ;
+  return res;
+}
+
+/* Only load the lowest 8 elements and directly copy them to the high half-part,
+   avoiding extra vinsgr2vr.b instructions.  */
+/* { dg-final { scan-assembler-times "v16i8:.*\tvreplvei\\.d.*v16i8" 1 } } */
+v16i8
+vec_construct_v16i8 ()
+{
+  v16i8 res =
+  { x_qi[0], x_qi[1], x_qi[0], x_qi[2], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[3],
+    x_qi[0], x_qi[1], x_qi[0], x_qi[2], 
+    x_qi[0], x_qi[0], x_qi[0], x_qi[3] }
+  ;
+  return res;
+}
+
+/* Set 2 elements of a vector simultaneously by vilvl.d.  */
+/* { dg-final { scan-assembler-not "v2f64:.*\tvextrins\\.d.*v2f64" } } */
+/* { dg-final { scan-assembler-times "v2f64:.*\tvilvl\\.d.*v2f64" 1 } } */
+v2f64
+vec_construct_v2f64 ()
+{
+  v2f64 res =
+  { y_df[0], y_df[2] } 
+  ;
+  return res;
+}
+
+/* Set 2 elements of the vector simultaneously with vilvl.w,
+   avoiding extra vextrins.w instructions.  */
+/* { dg-final { scan-assembler-times "v4f32:.*\tvilvl\\.w.*v4f32" 1 } } */
+v4f32
+vec_construct_v4f32 ()
+{
+  v4f32 res =
+  { y_sf[0], y_sf[1], y_sf[0], y_sf[0] }
+  ;
+  return res;
+}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [pushed] [PATCH] LoongArch: Optimizations of vector construction.
  2023-09-21  1:19 [PATCH] LoongArch: Optimizations of vector construction Guo Jie
@ 2023-09-25  3:04 ` chenglulu
  0 siblings, 0 replies; 2+ messages in thread
From: chenglulu @ 2023-09-25  3:04 UTC (permalink / raw)
  To: Guo Jie, gcc-patches; +Cc: xuchenghua, i, xry111

Pushed to r14-4245.

在 2023/9/21 上午9:19, Guo Jie 写道:
> gcc/ChangeLog:
>
> 	* config/loongarch/lasx.md (lasx_vecinit_merge_<LASX:mode>): New
> 	pattern for vector construction.
> 	(vec_set<mode>_internal): Ditto.
> 	(lasx_xvinsgr2vr_<mode256_i_half>_internal): Ditto.
> 	(lasx_xvilvl_<lasxfmt_f>_internal): Ditto.
> 	* config/loongarch/loongarch.cc (loongarch_expand_vector_init):
> 	Optimized the implementation of vector construction.
> 	(loongarch_expand_vector_init_same): New function.
> 	* config/loongarch/lsx.md (lsx_vilvl_<lsxfmt_f>_internal): New
> 	pattern for vector construction.
> 	(lsx_vreplvei_mirror_<lsxfmt_f>): New pattern for vector
> 	construction.
> 	(vec_concatv2df): Ditto.
> 	(vec_concatv4sf): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c: New test.
> 	* gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c: New test.
> ---
>   gcc/config/loongarch/lasx.md                  |  69 ++
>   gcc/config/loongarch/loongarch.cc             | 716 +++++++++---------
>   gcc/config/loongarch/lsx.md                   | 134 ++++
>   .../vector/lasx/lasx-vec-construct-opt.c      | 102 +++
>   .../vector/lsx/lsx-vec-construct-opt.c        |  85 +++
>   5 files changed, 732 insertions(+), 374 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
>
> diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
> index 8111c8bb79a..2bc5d47ed4a 100644
> --- a/gcc/config/loongarch/lasx.md
> +++ b/gcc/config/loongarch/lasx.md
> @@ -186,6 +186,9 @@ (define_c_enum "unspec" [
>     UNSPEC_LASX_XVLDI
>     UNSPEC_LASX_XVLDX
>     UNSPEC_LASX_XVSTX
> +  UNSPEC_LASX_VECINIT_MERGE
> +  UNSPEC_LASX_VEC_SET_INTERNAL
> +  UNSPEC_LASX_XVILVL_INTERNAL
>   ])
>   
>   ;; All vector modes with 256 bits.
> @@ -255,6 +258,15 @@ (define_mode_attr VFHMODE256
>      [(V8SF "V4SF")
>      (V4DF "V2DF")])
>   
> +;; The attribute gives half int/float modes for vector modes.
> +(define_mode_attr VHMODE256_ALL
> +  [(V32QI "V16QI")
> +   (V16HI "V8HI")
> +   (V8SI "V4SI")
> +   (V4DI "V2DI")
> +   (V8SF "V4SF")
> +   (V4DF "V2DF")])
> +
>   ;; The attribute gives double modes for vector modes in LASX.
>   (define_mode_attr VDMODE256
>     [(V8SI "V4DI")
> @@ -312,6 +324,11 @@ (define_mode_attr mode256_f
>      (V4DI "v4df")
>      (V8SI "v8sf")])
>   
> +;; This attribute gives V32QI mode and V16HI mode with half size.
> +(define_mode_attr mode256_i_half
> +  [(V32QI "v16qi")
> +   (V16HI "v8hi")])
> +
>    ;; This attribute gives suffix for LASX instructions.  HOW?
>   (define_mode_attr lasxfmt
>     [(V4DF "d")
> @@ -756,6 +773,20 @@ (define_insn "lasx_xvpermi_q_<LASX:mode>"
>     [(set_attr "type" "simd_splat")
>      (set_attr "mode" "<MODE>")])
>   
> +;; Only for loongarch_expand_vector_init in loongarch.cc.
> +;; Support a LSX-mode input op2.
> +(define_insn "lasx_vecinit_merge_<LASX:mode>"
> +  [(set (match_operand:LASX 0 "register_operand" "=f")
> +	(unspec:LASX
> +	  [(match_operand:LASX 1 "register_operand" "0")
> +	   (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")
> +	   (match_operand     3 "const_uimm8_operand")]
> +	   UNSPEC_LASX_VECINIT_MERGE))]
> +  "ISA_HAS_LASX"
> +  "xvpermi.q\t%u0,%u2,%3"
> +  [(set_attr "type" "simd_splat")
> +   (set_attr "mode" "<MODE>")])
> +
>   (define_insn "lasx_xvpickve2gr_d<u>"
>     [(set (match_operand:DI 0 "register_operand" "=r")
>   	(any_extend:DI
> @@ -779,6 +810,33 @@ (define_expand "vec_set<mode>"
>     DONE;
>   })
>   
> +;; Only for loongarch_expand_vector_init in loongarch.cc.
> +;; Simulate missing instructions xvinsgr2vr.b and xvinsgr2vr.h.
> +(define_expand "vec_set<mode>_internal"
> +  [(match_operand:ILASX_HB 0 "register_operand")
> +   (match_operand:<UNITMODE> 1 "reg_or_0_operand")
> +   (match_operand 2 "const_<indeximm256>_operand")]
> +  "ISA_HAS_LASX"
> +{
> +  rtx index = GEN_INT (1 << INTVAL (operands[2]));
> +  emit_insn (gen_lasx_xvinsgr2vr_<mode256_i_half>_internal
> +	     (operands[0], operands[1], operands[0], index));
> +  DONE;
> +})
> +
> +(define_insn "lasx_xvinsgr2vr_<mode256_i_half>_internal"
> +  [(set (match_operand:ILASX_HB 0 "register_operand" "=f")
> +	(unspec:ILASX_HB [(match_operand:<UNITMODE> 1 "reg_or_0_operand" "rJ")
> +			  (match_operand:ILASX_HB 2 "register_operand" "0")
> +			  (match_operand 3 "const_<bitmask256>_operand" "")]
> +			 UNSPEC_LASX_VEC_SET_INTERNAL))]
> +  "ISA_HAS_LASX"
> +{
> +  return "vinsgr2vr.<lasxfmt>\t%w0,%z1,%y3";
> +}
> +  [(set_attr "type" "simd_insert")
> +   (set_attr "mode" "<MODE>")])
> +
>   (define_expand "vec_set<mode>"
>     [(match_operand:FLASX 0 "register_operand")
>      (match_operand:<UNITMODE> 1 "reg_or_0_operand")
> @@ -1567,6 +1625,17 @@ (define_insn "logb<mode>2"
>     [(set_attr "type" "simd_flog2")
>      (set_attr "mode" "<MODE>")])
>   
> +;; Only for loongarch_expand_vector_init in loongarch.cc.
> +;; Merge two scalar floating-point operands op1 and op2 into an LASX op0.
> +(define_insn "lasx_xvilvl_<lasxfmt_f>_internal"
> +  [(set (match_operand:FLASX 0 "register_operand" "=f")
> +	(unspec:FLASX [(match_operand:<UNITMODE> 1 "register_operand" "f")
> +		       (match_operand:<UNITMODE> 2 "register_operand" "f")]
> +		      UNSPEC_LASX_XVILVL_INTERNAL))]
> +  "ISA_HAS_LASX"
> +  "xvilvl.<lasxfmt>\t%u0,%u2,%u1"
> +  [(set_attr "type" "simd_permute")
> +   (set_attr "mode" "<MODE>")])
>   
>   (define_insn "smax<mode>3"
>     [(set (match_operand:FLASX 0 "register_operand" "=f")
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index 845fad5a8e8..9e1b0d0cfa8 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -10199,300 +10199,344 @@ loongarch_expand_vector_group_init (rtx target, rtx vals)
>   						      ops[1])));
>   }
>   
> +/* Expand initialization of a vector which has all same elements.  */
> +
>   void
> -loongarch_expand_vector_init (rtx target, rtx vals)
> +loongarch_expand_vector_init_same (rtx target, rtx vals, unsigned nvar)
>   {
>     machine_mode vmode = GET_MODE (target);
>     machine_mode imode = GET_MODE_INNER (vmode);
> -  unsigned i, nelt = GET_MODE_NUNITS (vmode);
> -  unsigned nvar = 0;
> -  bool all_same = true;
> -  rtx x;
> +  rtx same = XVECEXP (vals, 0, 0);
> +  rtx temp, temp2;
>   
> -  for (i = 0; i < nelt; ++i)
> +  if (CONST_INT_P (same) && nvar == 0
> +      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
> +    {
> +      switch (vmode)
> +	{
> +	case E_V32QImode:
> +	case E_V16HImode:
> +	case E_V8SImode:
> +	case E_V4DImode:
> +	case E_V16QImode:
> +	case E_V8HImode:
> +	case E_V4SImode:
> +	case E_V2DImode:
> +	  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
> +	  emit_move_insn (target, temp);
> +	  return;
> +	default:
> +	  gcc_unreachable ();
> +	}
> +    }
> +  temp = gen_reg_rtx (imode);
> +  if (imode == GET_MODE (same))
> +    temp2 = same;
> +  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
>       {
> -      x = XVECEXP (vals, 0, i);
> -      if (!loongarch_constant_elt_p (x))
> -	nvar++;
> -      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
> -	all_same = false;
> +      if (GET_CODE (same) == MEM)
> +	{
> +	  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> +	  loongarch_emit_move (reg_tmp, same);
> +	  temp2 = simplify_gen_subreg (imode, reg_tmp, GET_MODE (reg_tmp), 0);
> +	}
> +      else
> +	temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), 0);
>       }
> -
> -  if (ISA_HAS_LASX && GET_MODE_SIZE (vmode) == 32)
> +  else
>       {
> -      if (all_same)
> +      if (GET_CODE (same) == MEM)
>   	{
> -	  rtx same = XVECEXP (vals, 0, 0);
> -	  rtx temp, temp2;
> +	  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> +	  loongarch_emit_move (reg_tmp, same);
> +	  temp2 = lowpart_subreg (imode, reg_tmp, GET_MODE (reg_tmp));
> +	}
> +      else
> +	temp2 = lowpart_subreg (imode, same, GET_MODE (same));
> +    }
> +  emit_move_insn (temp, temp2);
>   
> -	  if (CONST_INT_P (same) && nvar == 0
> -	      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
> -	    {
> -	      switch (vmode)
> -		{
> -		case E_V32QImode:
> -		case E_V16HImode:
> -		case E_V8SImode:
> -		case E_V4DImode:
> -		  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
> -		  emit_move_insn (target, temp);
> -		  return;
> +  switch (vmode)
> +    {
> +    case E_V32QImode:
> +    case E_V16HImode:
> +    case E_V8SImode:
> +    case E_V4DImode:
> +    case E_V16QImode:
> +    case E_V8HImode:
> +    case E_V4SImode:
> +    case E_V2DImode:
> +      loongarch_emit_move (target, gen_rtx_VEC_DUPLICATE (vmode, temp));
> +      break;
>   
> -		default:
> -		  gcc_unreachable ();
> -		}
> -	    }
> +    case E_V8SFmode:
> +      emit_insn (gen_lasx_xvreplve0_w_f_scalar (target, temp));
> +      break;
>   
> -	  temp = gen_reg_rtx (imode);
> -	  if (imode == GET_MODE (same))
> -	    temp2 = same;
> -	  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
> -	    {
> -	      if (GET_CODE (same) == MEM)
> -		{
> -		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> -		  loongarch_emit_move (reg_tmp, same);
> -		  temp2 = simplify_gen_subreg (imode, reg_tmp,
> -					       GET_MODE (reg_tmp), 0);
> -		}
> -	      else
> -		temp2 = simplify_gen_subreg (imode, same,
> -					     GET_MODE (same), 0);
> -	    }
> -	  else
> -	    {
> -	      if (GET_CODE (same) == MEM)
> -		{
> -		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> -		  loongarch_emit_move (reg_tmp, same);
> -		  temp2 = lowpart_subreg (imode, reg_tmp,
> -					  GET_MODE (reg_tmp));
> -		}
> -	      else
> -		temp2 = lowpart_subreg (imode, same, GET_MODE (same));
> -	    }
> -	  emit_move_insn (temp, temp2);
> +    case E_V4DFmode:
> +      emit_insn (gen_lasx_xvreplve0_d_f_scalar (target, temp));
> +      break;
>   
> -	  switch (vmode)
> -	    {
> -	    case E_V32QImode:
> -	    case E_V16HImode:
> -	    case E_V8SImode:
> -	    case E_V4DImode:
> -	      loongarch_emit_move (target,
> -				   gen_rtx_VEC_DUPLICATE (vmode, temp));
> -	      break;
> +    case E_V4SFmode:
> +      emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
> +      break;
>   
> -	    case E_V8SFmode:
> -	      emit_insn (gen_lasx_xvreplve0_w_f_scalar (target, temp));
> -	      break;
> +    case E_V2DFmode:
> +      emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
> +      break;
>   
> -	    case E_V4DFmode:
> -	      emit_insn (gen_lasx_xvreplve0_d_f_scalar (target, temp));
> -	      break;
> +    default:
> +      gcc_unreachable ();
> +    }
> +}
>   
> -	    default:
> -	      gcc_unreachable ();
> -	    }
> -	}
> -      else
> -	{
> -	  rtvec vec = shallow_copy_rtvec (XVEC (vals, 0));
> +/* Expand a vector initialization.  */
>   
> -	  for (i = 0; i < nelt; ++i)
> -	    RTVEC_ELT (vec, i) = CONST0_RTX (imode);
> +void
> +loongarch_expand_vector_init (rtx target, rtx vals)
> +{
> +  machine_mode vmode = GET_MODE (target);
> +  machine_mode imode = GET_MODE_INNER (vmode);
> +  unsigned i, nelt = GET_MODE_NUNITS (vmode);
> +  /* VALS is divided into high and low half-part.  */
> +  /* Number of non constant elements in corresponding parts of VALS.  */
> +  unsigned nvar = 0, hi_nvar = 0, lo_nvar = 0;
> +  /* all_same : true if all elements of VALS are the same.
> +     hi_same : true if all elements of the high half-part are the same.
> +     lo_same : true if all elements of the low half-part are the same.
> +     half_same : true if the high half-part is the same as the low one.  */
> +  bool all_same = false, hi_same = true, lo_same = true, half_same = true;
> +  rtx val[32], val_hi[32], val_lo[16];
> +  rtx x, op0, op1;
> +  /* Copy one element of vals to per element of target vector.  */
> +  typedef rtx (*loongarch_vec_repl1_fn) (rtx, rtx);
> +  /* Copy two elements of vals to target vector.  */
> +  typedef rtx (*loongarch_vec_repl2_fn) (rtx, rtx, rtx);
> +  /* Insert scalar operands into the specified position of the vector.  */
> +  typedef rtx (*loongarch_vec_set_fn) (rtx, rtx, rtx);
> +  /* Copy 64bit lowpart to highpart.  */
> +  typedef rtx (*loongarch_vec_mirror_fn) (rtx, rtx, rtx);
> +  /* Merge lowpart and highpart into target.  */
> +  typedef rtx (*loongarch_vec_merge_fn) (rtx, rtx, rtx, rtx);
> +
> +  loongarch_vec_repl1_fn loongarch_vec_repl1_128 = NULL,
> +			 loongarch_vec_repl1_256 = NULL;
> +  loongarch_vec_repl2_fn loongarch_vec_repl2_128 = NULL,
> +			 loongarch_vec_repl2_256 = NULL;
> +  loongarch_vec_set_fn loongarch_vec_set128 = NULL, loongarch_vec_set256 = NULL;
> +  loongarch_vec_mirror_fn loongarch_vec_mirror = NULL;
> +  loongarch_vec_merge_fn loongarch_lasx_vecinit_merge = NULL;
> +  machine_mode half_mode = VOIDmode;
> +
> +  /* Check whether elements of each part are the same.  */
> +  for (i = 0; i < nelt / 2; ++i)
> +    {
> +      val_hi[i] = val_hi[i + nelt / 2] = val[i + nelt / 2]
> +	= XVECEXP (vals, 0, i + nelt / 2);
> +      val_lo[i] = val[i] = XVECEXP (vals, 0, i);
> +      if (!loongarch_constant_elt_p (val_hi[i]))
> +	hi_nvar++;
> +      if (!loongarch_constant_elt_p (val_lo[i]))
> +	lo_nvar++;
> +      if (i > 0 && !rtx_equal_p (val_hi[i], val_hi[0]))
> +	hi_same = false;
> +      if (i > 0 && !rtx_equal_p (val_lo[i], val_lo[0]))
> +	lo_same = false;
> +      if (!rtx_equal_p (val_hi[i], val_lo[i]))
> +	half_same = false;
> +    }
> +
> +  /* If all elements are the same, set all_same true.  */
> +  if (hi_same && lo_same && half_same)
> +    all_same = true;
> +
> +  nvar = hi_nvar + lo_nvar;
>   
> -	  emit_move_insn (target, gen_rtx_CONST_VECTOR (vmode, vec));
> +  switch (vmode)
> +    {
> +    case E_V32QImode:
> +      half_mode = E_V16QImode;
> +      loongarch_vec_set256 = gen_vec_setv32qi_internal;
> +      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_b;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v32qi : gen_lasx_vecinit_merge_v32qi;
> +      /* FALLTHRU.  */
> +    case E_V16QImode:
> +      loongarch_vec_set128 = gen_vec_setv16qi;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_b;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_b;
> +      break;
>   
> -	  machine_mode half_mode = VOIDmode;
> -	  rtx target_hi, target_lo;
> +    case E_V16HImode:
> +      half_mode = E_V8HImode;
> +      loongarch_vec_set256 = gen_vec_setv16hi_internal;
> +      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_h;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v16hi : gen_lasx_vecinit_merge_v16hi;
> +      /* FALLTHRU.  */
> +    case E_V8HImode:
> +      loongarch_vec_set128 = gen_vec_setv8hi;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_h;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_h;
> +      break;
>   
> -	  switch (vmode)
> -	    {
> -	    case E_V32QImode:
> -	      half_mode=E_V16QImode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> -		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> -		  if (i == 0)
> -		    {
> -		      emit_insn (gen_lsx_vreplvei_b_scalar (target_hi,
> -							    temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_b_scalar (target_lo,
> -							    temp_lo));
> -		    }
> -		  else
> -		    {
> -		      emit_insn (gen_vec_setv16qi (target_hi, temp_hi,
> -						   GEN_INT (i)));
> -		      emit_insn (gen_vec_setv16qi (target_lo, temp_lo,
> -						   GEN_INT (i)));
> -		    }
> -		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> +    case E_V8SImode:
> +      half_mode = V4SImode;
> +      loongarch_vec_set256 = gen_vec_setv8si;
> +      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_w;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v8si : gen_lasx_vecinit_merge_v8si;
> +      /* FALLTHRU.  */
> +    case E_V4SImode:
> +      loongarch_vec_set128 = gen_vec_setv4si;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_w;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_w;
> +      break;
>   
> -	    case E_V16HImode:
> -	      half_mode=E_V8HImode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> -		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> -		  if (i == 0)
> -		    {
> -		      emit_insn (gen_lsx_vreplvei_h_scalar (target_hi,
> -							    temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_h_scalar (target_lo,
> -							    temp_lo));
> -		    }
> -		  else
> -		    {
> -		      emit_insn (gen_vec_setv8hi (target_hi, temp_hi,
> -						  GEN_INT (i)));
> -		      emit_insn (gen_vec_setv8hi (target_lo, temp_lo,
> -						  GEN_INT (i)));
> -		    }
> -		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> +    case E_V4DImode:
> +      half_mode = E_V2DImode;
> +      loongarch_vec_set256 = gen_vec_setv4di;
> +      loongarch_vec_repl1_256 = gen_lasx_xvreplgr2vr_d;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v4di : gen_lasx_vecinit_merge_v4di;
> +      /* FALLTHRU.  */
> +    case E_V2DImode:
> +      loongarch_vec_set128 = gen_vec_setv2di;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplgr2vr_d;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_d;
> +      break;
>   
> -	    case E_V8SImode:
> -	      half_mode=V4SImode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> -		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> -		  if (i == 0)
> -		    {
> -		      emit_insn (gen_lsx_vreplvei_w_scalar (target_hi,
> -							    temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_w_scalar (target_lo,
> -							    temp_lo));
> -		    }
> -		  else
> -		    {
> -		      emit_insn (gen_vec_setv4si (target_hi, temp_hi,
> -						  GEN_INT (i)));
> -		      emit_insn (gen_vec_setv4si (target_lo, temp_lo,
> -						  GEN_INT (i)));
> -		    }
> -		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> +    case E_V8SFmode:
> +      half_mode = E_V4SFmode;
> +      loongarch_vec_set256 = gen_vec_setv8sf;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplvei_w_f_scalar;
> +      loongarch_vec_repl2_256 = gen_lasx_xvilvl_w_f_internal;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v8sf : gen_lasx_vecinit_merge_v8sf;
> +      /* FALLTHRU.  */
> +    case E_V4SFmode:
> +      loongarch_vec_set128 = gen_vec_setv4sf;
> +      loongarch_vec_repl2_128 = gen_lsx_vilvl_w_f_internal;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_w_f;
> +      break;
>   
> -	    case E_V4DImode:
> -	      half_mode=E_V2DImode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> -		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> -		  if (i == 0)
> -		    {
> -		      emit_insn (gen_lsx_vreplvei_d_scalar (target_hi,
> -							    temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_d_scalar (target_lo,
> -							    temp_lo));
> -		    }
> -		  else
> -		    {
> -		      emit_insn (gen_vec_setv2di (target_hi, temp_hi,
> -						  GEN_INT (i)));
> -		      emit_insn (gen_vec_setv2di (target_lo, temp_lo,
> -						  GEN_INT (i)));
> -		    }
> -		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> +    case E_V4DFmode:
> +      half_mode = E_V2DFmode;
> +      loongarch_vec_set256 = gen_vec_setv4df;
> +      loongarch_vec_repl1_128 = gen_lsx_vreplvei_d_f_scalar;
> +      loongarch_vec_repl2_256 = gen_lasx_xvilvl_d_f_internal;
> +      loongarch_lasx_vecinit_merge
> +	= half_same ? gen_lasx_xvpermi_q_v4df : gen_lasx_vecinit_merge_v4df;
> +      /* FALLTHRU.  */
> +    case E_V2DFmode:
> +      loongarch_vec_set128 = gen_vec_setv2df;
> +      loongarch_vec_repl2_128 = gen_lsx_vilvl_d_f_internal;
> +      loongarch_vec_mirror = gen_lsx_vreplvei_mirror_d_f;
> +      break;
>   
> -	    case E_V8SFmode:
> -	      half_mode=E_V4SFmode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  if (ISA_HAS_LASX && GET_MODE_SIZE (vmode) == 32)
> +    {
> +      /* If all elements are the same, just do a broadcast.  */
> +      if (all_same)
> +	loongarch_expand_vector_init_same (target, vals, nvar);
> +      else
> +	{
> +	  gcc_assert (nelt >= 4);
> +
> +	  rtx target_hi, target_lo;
> +	  /* Write elements of high half-part in target directly.  */
> +	  target_hi = target;
> +	  target_lo = gen_reg_rtx (half_mode);
> +
> +	  /* If all elements of high half-part are the same,
> +	     just do a broadcast.  Also applicable to low half-part.  */
> +	  if (hi_same)
> +	    {
> +	      rtx vtmp = gen_rtx_PARALLEL (vmode, gen_rtvec_v (nelt, val_hi));
> +	      loongarch_expand_vector_init_same (target_hi, vtmp, hi_nvar);
> +	    }
> +	  if (lo_same)
> +	    {
> +	      rtx vtmp
> +		= gen_rtx_PARALLEL (half_mode, gen_rtvec_v (nelt / 2, val_lo));
> +	      loongarch_expand_vector_init_same (target_lo, vtmp, lo_nvar);
> +	    }
> +
> +	  for (i = 0; i < nelt / 2; ++i)
> +	    {
> +	      if (!hi_same)
>   		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> -		  if (i == 0)
> +		  if (vmode == E_V8SFmode || vmode == E_V4DFmode)
>   		    {
> -		      emit_insn (gen_lsx_vreplvei_w_f_scalar (target_hi,
> -							      temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_w_f_scalar (target_lo,
> -							      temp_lo));
> +		      /* Using xvilvl to load lowest 2 elements simultaneously
> +			 to reduce the number of instructions.  */
> +		      if (i == 1)
> +			{
> +			  op0 = gen_reg_rtx (imode);
> +			  emit_move_insn (op0, val_hi[0]);
> +			  op1 = gen_reg_rtx (imode);
> +			  emit_move_insn (op1, val_hi[1]);
> +			  emit_insn (
> +			    loongarch_vec_repl2_256 (target_hi, op0, op1));
> +			}
> +		      else if (i > 1)
> +			{
> +			  op0 = gen_reg_rtx (imode);
> +			  emit_move_insn (op0, val_hi[i]);
> +			  emit_insn (
> +			    loongarch_vec_set256 (target_hi, op0, GEN_INT (i)));
> +			}
>   		    }
>   		  else
>   		    {
> -		      emit_insn (gen_vec_setv4sf (target_hi, temp_hi,
> -						  GEN_INT (i)));
> -		      emit_insn (gen_vec_setv4sf (target_lo, temp_lo,
> -						  GEN_INT (i)));
> +		      /* Assign the lowest element of val_hi to all elements
> +			 of target_hi.  */
> +		      if (i == 0)
> +			{
> +			  op0 = gen_reg_rtx (imode);
> +			  emit_move_insn (op0, val_hi[0]);
> +			  emit_insn (loongarch_vec_repl1_256 (target_hi, op0));
> +			}
> +		      else if (!rtx_equal_p (val_hi[i], val_hi[0]))
> +			{
> +			  op0 = gen_reg_rtx (imode);
> +			  emit_move_insn (op0, val_hi[i]);
> +			  emit_insn (
> +			    loongarch_vec_set256 (target_hi, op0, GEN_INT (i)));
> +			}
>   		    }
>   		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> -
> -	    case E_V4DFmode:
> -	      half_mode=E_V2DFmode;
> -	      target_hi = gen_reg_rtx (half_mode);
> -	      target_lo = gen_reg_rtx (half_mode);
> -	      for (i = 0; i < nelt/2; ++i)
> +	      if (!lo_same && !half_same)
>   		{
> -		  rtx temp_hi = gen_reg_rtx (imode);
> -		  rtx temp_lo = gen_reg_rtx (imode);
> -		  emit_move_insn (temp_hi, XVECEXP (vals, 0, i+nelt/2));
> -		  emit_move_insn (temp_lo, XVECEXP (vals, 0, i));
> +		  /* Assign the lowest element of val_lo to all elements
> +		     of target_lo.  */
>   		  if (i == 0)
>   		    {
> -		      emit_insn (gen_lsx_vreplvei_d_f_scalar (target_hi,
> -							      temp_hi));
> -		      emit_insn (gen_lsx_vreplvei_d_f_scalar (target_lo,
> -							      temp_lo));
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val_lo[0]);
> +		      emit_insn (loongarch_vec_repl1_128 (target_lo, op0));
>   		    }
> -		  else
> +		  else if (!rtx_equal_p (val_lo[i], val_lo[0]))
>   		    {
> -		      emit_insn (gen_vec_setv2df (target_hi, temp_hi,
> -						  GEN_INT (i)));
> -		      emit_insn (gen_vec_setv2df (target_lo, temp_lo,
> -						  GEN_INT (i)));
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val_lo[i]);
> +		      emit_insn (
> +			loongarch_vec_set128 (target_lo, op0, GEN_INT (i)));
>   		    }
>   		}
> -	      emit_insn (gen_rtx_SET (target,
> -				      gen_rtx_VEC_CONCAT (vmode, target_hi,
> -							  target_lo)));
> -	      break;
> -
> -	    default:
> -	      gcc_unreachable ();
>   	    }
> -
> +	  if (half_same)
> +	    {
> +	      emit_insn (loongarch_lasx_vecinit_merge (target, target_hi,
> +						       target_hi, const0_rtx));
> +	      return;
> +	    }
> +	  emit_insn (loongarch_lasx_vecinit_merge (target, target_hi, target_lo,
> +						   GEN_INT (0x20)));
>   	}
>         return;
>       }
> @@ -10500,130 +10544,54 @@ loongarch_expand_vector_init (rtx target, rtx vals)
>     if (ISA_HAS_LSX)
>       {
>         if (all_same)
> +	loongarch_expand_vector_init_same (target, vals, nvar);
> +      else
>   	{
> -	  rtx same = XVECEXP (vals, 0, 0);
> -	  rtx temp, temp2;
> -
> -	  if (CONST_INT_P (same) && nvar == 0
> -	      && loongarch_signed_immediate_p (INTVAL (same), 10, 0))
> -	    {
> -	      switch (vmode)
> -		{
> -		case E_V16QImode:
> -		case E_V8HImode:
> -		case E_V4SImode:
> -		case E_V2DImode:
> -		  temp = gen_rtx_CONST_VECTOR (vmode, XVEC (vals, 0));
> -		  emit_move_insn (target, temp);
> -		  return;
> -
> -		default:
> -		  gcc_unreachable ();
> -		}
> -	    }
> -	  temp = gen_reg_rtx (imode);
> -	  if (imode == GET_MODE (same))
> -	    temp2 = same;
> -	  else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
> -	    {
> -	      if (GET_CODE (same) == MEM)
> -		{
> -		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> -		  loongarch_emit_move (reg_tmp, same);
> -		  temp2 = simplify_gen_subreg (imode, reg_tmp,
> -					       GET_MODE (reg_tmp), 0);
> -		}
> -	      else
> -		temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), 0);
> -	    }
> -	  else
> +	  for (i = 0; i < nelt; ++i)
>   	    {
> -	      if (GET_CODE (same) == MEM)
> +	      if (vmode == E_V4SFmode || vmode == E_V2DFmode)
>   		{
> -		  rtx reg_tmp = gen_reg_rtx (GET_MODE (same));
> -		  loongarch_emit_move (reg_tmp, same);
> -		  temp2 = lowpart_subreg (imode, reg_tmp, GET_MODE (reg_tmp));
> +		  /* Using vilvl to load lowest 2 elements simultaneously to
> +		     reduce the number of instructions.  */
> +		  if (i == 1)
> +		    {
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val[0]);
> +		      op1 = gen_reg_rtx (imode);
> +		      emit_move_insn (op1, val[1]);
> +		      emit_insn (loongarch_vec_repl2_128 (target, op0, op1));
> +		    }
> +		  else if (i > 1)
> +		    {
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val[i]);
> +		      emit_insn (
> +			loongarch_vec_set128 (target, op0, GEN_INT (i)));
> +		    }
>   		}
>   	      else
> -		temp2 = lowpart_subreg (imode, same, GET_MODE (same));
> -	    }
> -	  emit_move_insn (temp, temp2);
> -
> -	  switch (vmode)
> -	    {
> -	    case E_V16QImode:
> -	    case E_V8HImode:
> -	    case E_V4SImode:
> -	    case E_V2DImode:
> -	      loongarch_emit_move (target, gen_rtx_VEC_DUPLICATE (vmode, temp));
> -	      break;
> -
> -	    case E_V4SFmode:
> -	      emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
> -	      break;
> -
> -	    case E_V2DFmode:
> -	      emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
> -	      break;
> -
> -	    default:
> -	      gcc_unreachable ();
> -	    }
> -	}
> -      else
> -	{
> -	  emit_move_insn (target, CONST0_RTX (vmode));
> -
> -	  for (i = 0; i < nelt; ++i)
> -	    {
> -	      rtx temp = gen_reg_rtx (imode);
> -	      emit_move_insn (temp, XVECEXP (vals, 0, i));
> -	      switch (vmode)
>   		{
> -		case E_V16QImode:
> -		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_b_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv16qi (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		case E_V8HImode:
> -		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_h_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv8hi (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		case E_V4SImode:
> -		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_w_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv4si (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		case E_V2DImode:
> -		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_d_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv2di (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		case E_V4SFmode:
> -		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_w_f_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv4sf (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		case E_V2DFmode:
> +		  if (half_same && i == nelt / 2)
> +		    {
> +		      emit_insn (
> +			loongarch_vec_mirror (target, target, const0_rtx));
> +		      return;
> +		    }
> +		  /* Assign the lowest element of val to all elements of
> +		     target.  */
>   		  if (i == 0)
> -		    emit_insn (gen_lsx_vreplvei_d_f_scalar (target, temp));
> -		  else
> -		    emit_insn (gen_vec_setv2df (target, temp, GEN_INT (i)));
> -		  break;
> -
> -		default:
> -		  gcc_unreachable ();
> +		    {
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val[0]);
> +		      emit_insn (loongarch_vec_repl1_128 (target, op0));
> +		    }
> +		  else if (!rtx_equal_p (val[i], val[0]))
> +		    {
> +		      op0 = gen_reg_rtx (imode);
> +		      emit_move_insn (op0, val[i]);
> +		      emit_insn (
> +			loongarch_vec_set128 (target, op0, GEN_INT (i)));
> +		    }
>   		}
>   	    }
>   	}
> @@ -10640,8 +10608,8 @@ loongarch_expand_vector_init (rtx target, rtx vals)
>     /* For two-part initialization, always use CONCAT.  */
>     if (nelt == 2)
>       {
> -      rtx op0 = force_reg (imode, XVECEXP (vals, 0, 0));
> -      rtx op1 = force_reg (imode, XVECEXP (vals, 0, 1));
> +      rtx op0 = force_reg (imode, val[0]);
> +      rtx op1 = force_reg (imode, val[1]);
>         x = gen_rtx_VEC_CONCAT (vmode, op0, op1);
>         emit_insn (gen_rtx_SET (target, x));
>         return;
> diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
> index fb4d228ba84..075f6ba569d 100644
> --- a/gcc/config/loongarch/lsx.md
> +++ b/gcc/config/loongarch/lsx.md
> @@ -176,6 +176,8 @@ (define_c_enum "unspec" [
>     UNSPEC_LSX_VSSRARNI
>     UNSPEC_LSX_VSSRARNI2
>     UNSPEC_LSX_VPERMI
> +  UNSPEC_LSX_VILVL_INTERNAL
> +  UNSPEC_LSX_VREPLVEI_MIRROR
>   ])
>   
>   ;; This attribute gives suffix for integers in VHMODE.
> @@ -1551,6 +1553,18 @@ (define_insn "logb<mode>2"
>     [(set_attr "type" "simd_flog2")
>      (set_attr "mode" "<MODE>")])
>   
> +;; Only for loongarch_expand_vector_init in loongarch.cc.
> +;; Merge two scalar floating-point op1 and op2 into a LSX op0.
> +(define_insn "lsx_vilvl_<lsxfmt_f>_internal"
> +  [(set (match_operand:FLSX 0 "register_operand" "=f")
> +	(unspec:FLSX [(match_operand:<UNITMODE> 1 "register_operand" "f")
> +		      (match_operand:<UNITMODE> 2 "register_operand" "f")]
> +		     UNSPEC_LSX_VILVL_INTERNAL))]
> +  "ISA_HAS_LSX"
> +  "vilvl.<lsxfmt>\t%w0,%w2,%w1"
> +  [(set_attr "type" "simd_permute")
> +   (set_attr "mode" "<MODE>")])
> +
>   (define_insn "smax<mode>3"
>     [(set (match_operand:FLSX 0 "register_operand" "=f")
>   	(smax:FLSX (match_operand:FLSX 1 "register_operand" "f")
> @@ -2289,6 +2303,16 @@ (define_insn "lsx_vreplve_<lsxfmt_f>"
>     [(set_attr "type" "simd_splat")
>      (set_attr "mode" "<MODE>")])
>   
> +(define_insn "lsx_vreplvei_mirror_<lsxfmt_f>"
> +  [(set (match_operand:LSX 0 "register_operand" "=f")
> +	(unspec: LSX [(match_operand:LSX 1 "register_operand" "f")
> +				(match_operand 2 "const_<indeximm>_operand" "")]
> +				UNSPEC_LSX_VREPLVEI_MIRROR))]
> +  "ISA_HAS_LSX"
> +  "vreplvei.d\t%w0,%w1,%2"
> +  [(set_attr "type" "simd_splat")
> +   (set_attr "mode" "<MODE>")])
> +
>   (define_insn "lsx_vreplvei_<lsxfmt_f>"
>     [(set (match_operand:LSX 0 "register_operand" "=f")
>   	(vec_duplicate:LSX
> @@ -2450,6 +2474,99 @@ (define_expand "vec_concatv2di"
>     DONE;
>   })
>   
> +;; Implement vec_concatv2df by vilvl.d.
> +(define_insn_and_split "vec_concatv2df"
> +  [(set (match_operand:V2DF 0 "register_operand" "=f")
> +	(vec_concat:V2DF
> +	  (match_operand:DF 1 "register_operand" "f")
> +	  (match_operand:DF 2 "register_operand" "f")))]
> +  "ISA_HAS_LSX"
> +  ""
> +  "&& reload_completed"
> +  [(const_int 0)]
> +{
> +  emit_insn (gen_lsx_vilvl_d_f (operands[0],
> +				gen_rtx_REG (V2DFmode, REGNO (operands[1])),
> +				gen_rtx_REG (V2DFmode, REGNO (operands[2]))));
> +  DONE;
> +}
> +  [(set_attr "mode" "V2DF")])
> +
> +;; Implement vec_concatv4sf.
> +;; Optimize based on hardware register allocation of operands.
> +(define_insn_and_split "vec_concatv4sf"
> +  [(set (match_operand:V4SF 0 "register_operand" "=f")
> +	(vec_concat:V4SF
> +	  (vec_concat:V2SF
> +	    (match_operand:SF 1 "register_operand" "f")
> +	    (match_operand:SF 2 "register_operand" "f"))
> +	  (vec_concat:V2SF
> +	    (match_operand:SF 3 "register_operand" "f")
> +	    (match_operand:SF 4 "register_operand" "f"))))]
> +  "ISA_HAS_LSX"
> +  ""
> +  "&& reload_completed"
> +  [(const_int 0)]
> +{
> +  operands[5] = GEN_INT (1);
> +  operands[6] = GEN_INT (2);
> +  operands[7] = GEN_INT (4);
> +  operands[8] = GEN_INT (8);
> +
> +  /* If all inputs are the same, use vreplvei.w to broadcast.  */
> +  if (REGNO (operands[1]) == REGNO (operands[2])
> +      && REGNO (operands[1]) == REGNO (operands[3])
> +      && REGNO (operands[1]) == REGNO (operands[4]))
> +    {
> +      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[1]));
> +    }
> +  /* If op0 is equal to op3, use vreplvei.w to set each element of op0 as op3.
> +     If other input is different from op3, use vextrins.w to insert.  */
> +  else if (REGNO (operands[0]) == REGNO (operands[3]))
> +    {
> +      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[3]));
> +      if (REGNO (operands[1]) != REGNO (operands[3]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[1],
> +						operands[0], operands[5]));
> +      if (REGNO (operands[2]) != REGNO (operands[3]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[2],
> +						operands[0], operands[6]));
> +      if (REGNO (operands[4]) != REGNO (operands[3]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[4],
> +						operands[0], operands[8]));
> +    }
> +  /* If op0 is equal to op4, use vreplvei.w to set each element of op0 as op4.
> +     If other input is different from op4, use vextrins.w to insert.  */
> +  else if (REGNO (operands[0]) == REGNO (operands[4]))
> +    {
> +      emit_insn (gen_lsx_vreplvei_w_f_scalar (operands[0], operands[4]));
> +      if (REGNO (operands[1]) != REGNO (operands[4]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[1],
> +						operands[0], operands[5]));
> +      if (REGNO (operands[2]) != REGNO (operands[4]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[2],
> +						operands[0], operands[6]));
> +      if (REGNO (operands[3]) != REGNO (operands[4]))
> +	emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[3],
> +						operands[0], operands[7]));
> +    }
> +  /* Otherwise, use vilvl.w to merge op1 and op2 first.
> +     If op3 is different from op1, use vextrins.w to insert.
> +     If op4 is different from op2, use vextrins.w to insert.  */
> +  else
> +    {
> +      emit_insn (
> +	gen_lsx_vilvl_w_f (operands[0],
> +			   gen_rtx_REG (V4SFmode, REGNO (operands[1])),
> +			   gen_rtx_REG (V4SFmode, REGNO (operands[2]))));
> +      emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[3],
> +					      operands[0], operands[7]));
> +      emit_insn (gen_lsx_vextrins_w_f_scalar (operands[0], operands[4],
> +					      operands[0], operands[8]));
> +    }
> +  DONE;
> +}
> +  [(set_attr "mode" "V4SF")])
>   
>   (define_insn "vandn<mode>3"
>     [(set (match_operand:LSX 0 "register_operand" "=f")
> @@ -4465,3 +4582,20 @@ (define_insn "lsx_vpermi_w"
>     "vpermi.w\t%w0,%w2,%3"
>     [(set_attr "type" "simd_bit")
>      (set_attr "mode" "V4SI")])
> +
> +;; Delete one of two instructions that exactly play the same role.
> +(define_peephole2
> +  [(set (match_operand:V2DI 0 "register_operand")
> +	(vec_duplicate:V2DI (match_operand:DI 1 "register_operand")))
> +   (set (match_operand:V2DI 2 "register_operand")
> +	(vec_merge:V2DI
> +	  (vec_duplicate:V2DI (match_operand:DI 3 "register_operand"))
> +	  (match_operand:V2DI 4 "register_operand")
> +	  (match_operand 5 "const_int_operand")))]
> +  "operands[0] == operands[2] &&
> +   operands[1] == operands[3] &&
> +   operands[2] == operands[4] &&
> +   INTVAL (operands[5]) == 2"
> +  [(set (match_dup 0)
> +	(vec_duplicate:V2DI (match_dup 1)))]
> +  "")
> diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
> new file mode 100644
> index 00000000000..487816a483f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-vec-construct-opt.c
> @@ -0,0 +1,102 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mlasx -O3" } */
> +
> +#include <lasxintrin.h>
> +
> +extern long long *x_di;
> +extern int *x_si;
> +extern short int *x_hi;
> +extern char *x_qi;
> +extern double *y_df;
> +extern float *y_sf;
> +
> +/* Remove some unnecessary vinsgr2vr.d as the corresponding elements
> +   have already been set.  */
> +/* { dg-final { scan-assembler-not "v4i64:.*\tvinsgr2vr\\.d.*v4i64" } } */
> +/* { dg-final { scan-assembler-times "v4i64:.*\txvldrepl\\.d.*v4i64" 1 } } */
> +v4i64
> +vec_construct_v4i64 ()
> +{
> +  v4i64 res =
> +  { x_di[0], x_di[0], x_di[1], x_di[1] }
> +  ;
> +  return res;
> +}
> +
> +/* Remove some unnecessary vinsgr2vr.w as the corresponding elements
> +   have already been set.  */
> +/* { dg-final { scan-assembler-not "v8i32:.*\tvinsgr2vr\\.w.*v8i32" } } */
> +/* { dg-final { scan-assembler-times "v8i32:.*\txvreplgr2vr\\.w.*v8i32" 1 } } */
> +v8i32
> +vec_construct_v8i32 ()
> +{
> +  v8i32 res =
> +  { x_si[0], x_si[0], x_si[0], x_si[0],
> +    x_si[0], x_si[2], x_si[0], x_si[0] }
> +  ;
> +  return res;
> +}
> +
> +/* Remove some unnecessary vinsgr2vr.h as the corresponding elements
> +   have already been set.  */
> +/* { dg-final { scan-assembler-not "v16i16:.*\tvori\\.b.*v16i16" } } */
> +/* { dg-final { scan-assembler-times "v16i16:.*\txvreplgr2vr\\.h.*v16i1" 1 } } */
> +v16i16
> +vec_construct_v16i16 ()
> +{
> +  v16i16 res =
> +  { x_hi[1], x_hi[2], x_hi[1], x_hi[1],
> +    x_hi[1], x_hi[1], x_hi[1], x_hi[1],
> +    x_hi[1], x_hi[1], x_hi[1], x_hi[1],
> +    x_hi[1], x_hi[1], x_hi[1], x_hi[2] }
> +  ;
> +  return res;
> +}
> +
> +/* Remove some unnecessary vinsgr2vr.b as the corresponding elements
> +   have already been set.  */
> +/* { dg-final { scan-assembler-not "v32i8:.*\tvori\\.b.*v32i8" } } */
> +/* { dg-final { scan-assembler-times "v32i8:.*\txvreplgr2vr\\.b.*v32i8" 1 } } */
> +v32i8
> +vec_construct_v32i8 ()
> +{
> +  v32i8 res =
> +  { x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[2],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[0],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[3] }
> +  ;
> +  return res;
> +}
> +
> +/* Set 2 elements of a vector simultaneously by vilvl.d
> +   and reducing more vextrins.d.  */
> +/* { dg-final { scan-assembler-not "v4f64:.*\tvori\\.b.*v4f64" } } */
> +/* { dg-final { scan-assembler-not "v4f64:.*\tvextrins\\.d.*v4f64" } } */
> +/* { dg-final { scan-assembler-times "v4f64:.*\tvilvl\\.d.*v4f64" 1 } } */
> +v4f64
> +vec_construct_v4f64 ()
> +{
> +  v4f64 res =
> +  { y_df[0], y_df[2], y_df[0], y_df[0]}
> +  ;
> +  return res;
> +}
> +
> +/* Set 2 elements of a vector simultaneously by vilvl.w
> +   and reducing more vextrins.w.  */
> +/* { dg-final { scan-assembler-not "v8f32:.*\tvextrins\\.w.*v8f32" } } */
> +/* { dg-final { scan-assembler-times "v8f32:.*\txvilvl\\.w.*v8f32" 1 } } */
> +v8f32
> +vec_construct_v8f32 ()
> +{
> +  v8f32 res =
> +  { y_sf[2], y_sf[1], y_sf[2], y_sf[3],
> +    y_sf[2], y_sf[1], y_sf[2], y_sf[3] }
> +  ;
> +  return res;
> +}
> diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
> new file mode 100644
> index 00000000000..92da1c8af9c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mlsx -O3" } */
> +
> +#include <lsxintrin.h>
> +
> +extern long long *x_di;
> +extern int *x_si;
> +extern short int *x_hi;
> +extern char *x_qi;
> +extern double *y_df;
> +extern float *y_sf;
> +
> +/* No change for V2DI mode.  */
> +v2i64
> +vec_construct_v2i64 ()
> +{
> +  v2i64 res =
> +  { x_di[1], x_di[0]}
> +  ;
> +  return res;
> +}
> +
> +/* Only load the lowest 2 elements and directly copy them to high half-part,
> +   reducing more vinsgr2vr.w.  */
> +/* { dg-final { scan-assembler-times "v4i32:.*\tvreplvei\\.d.*v4i32" 1 } } */
> +v4i32
> +vec_construct_v4i32 ()
> +{
> +  v4i32 res =
> +  { x_si[0], x_si[1], x_si[0], x_si[1]}
> +  ;
> +  return res;
> +}
> +
> +/* Only load the lowest 4 elements and directly copy them to high half-part,
> +   reducing more vinsgr2vr.h.  */
> +/* { dg-final { scan-assembler-times "v8i16:.*\tvreplvei\\.d.*v8i16" 1 } } */
> +v8i16
> +vec_construct_v8i16 ()
> +{
> +  v8i16 res =
> +  { x_hi[0], x_hi[0], x_hi[0], x_hi[1],
> +    x_hi[0], x_hi[0], x_hi[0], x_hi[1] }
> +  ;
> +  return res;
> +}
> +
> +/* Only load the lowest 8 elements and directly copy them to high half-part,
> +   reducing more vinsgr2vr.b.  */
> +/* { dg-final { scan-assembler-times "v16i8:.*\tvreplvei\\.d.*v16i8" 1 } } */
> +v16i8
> +vec_construct_v16i8 ()
> +{
> +  v16i8 res =
> +  { x_qi[0], x_qi[1], x_qi[0], x_qi[2],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[3],
> +    x_qi[0], x_qi[1], x_qi[0], x_qi[2],
> +    x_qi[0], x_qi[0], x_qi[0], x_qi[3] }
> +  ;
> +  return res;
> +}
> +
> +/* Set 2 elements of a vector simultaneously by vilvl.d.  */
> +/* { dg-final { scan-assembler-not "v2f64:.*\tvextrins\\.d.*v2f64" } } */
> +/* { dg-final { scan-assembler-times "v2f64:.*\tvilvl\\.d.*v2f64" 1 } } */
> +v2f64
> +vec_construct_v2f64 ()
> +{
> +  v2f64 res =
> +  { y_df[0], y_df[2] }
> +  ;
> +  return res;
> +}
> +
> +/* Set 2 elements of a vector simultaneously by vilvl.w
> +   and reducing more vextrins.w.  */
> +/* { dg-final { scan-assembler-times "v4f32:.*\tvilvl\\.w.*v4f32" 1 } } */
> +v4f32
> +vec_construct_v4f32 ()
> +{
> +  v4f32 res =
> +  { y_sf[0], y_sf[1], y_sf[0], y_sf[0] }
> +  ;
> +  return res;
> +}


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-09-25  3:04 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-21  1:19 [PATCH] LoongArch: Optimizations of vector construction Guo Jie
2023-09-25  3:04 ` chenglulu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).