public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/work146-vsize)] Add vector_size(32) support.
@ 2023-11-18  8:01 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2023-11-18  8:01 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:df6d060211ce87b0dcfa90a75cd8b0535aa79da0

commit df6d060211ce87b0dcfa90a75cd8b0535aa79da0
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Sat Nov 18 03:01:24 2023 -0500

    Add vector_size(32) support.
    
    2023-11-18  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (const_0_to_31_operand): New predicate.
            (mma_assemble_input_operand): Allow any 16-byte vector type, not just
            V16QImode.
            * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
            __VECTOR_SIZE_32__ if -mvector-size-32 used.
            * config/rs6000/rs6000-protos.h (rs6000_expand_vector_pair_init): New
            declaration.
            (rs6000_expand_vector_pair_set): Likewise.
            (split_unary_vector_pair): Likewise.
            (split_binary_vector_pair): Likewisee.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (rs6000_hard_regno_mode_ok_uncached): Add
            support for -mvector-size-32.
            (rs6000_hard_regno_mode_ok): Likewise.
            (rs6000_setup_reg_addr_masks): Likewise.
            (rs6000_init_hard_regno_mode_ok): Likewise.
            (rs6000_option_override_internal): Likewise.
            (vector_pair_to_vector_mode): New function.
            (+rs6000_split_vpair_constant): Likewise.
            (rs6000_expand_vector_pair_init): Likewise.
            (rs6000_expand_vector_pair_set): Likewise.
            (reg_offset_addressing_ok_p): Add support for -mvector-size-32.
            (rs6000_emit_move): Likewise.
            (rs6000_preferred_reload_class): Likewise.
            (altivec_expand_vec_perm_le): Likewise.
            (rs6000_opt_vars): Add -mvector-size-32.
            (split_unary_vector_pair): New function.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pai): Likewise.
            (rs6000_split_multireg_move): Add -mvector-size-32 support.
            * config/rs6000/rs6000.h (VECTOR_PAIR_MODE): New macro.
            * config/rs6000/rs6000.md (wd mode attribute): Add vector pair modes.
            (RELOAD mode iterator): Likewise.
            (toplevel): Include vector-pair.md.
            * config/rs6000/rs6000.opt (-mvector-size-32): New option.
            * config/rs6000/vector-pair.md: New file.
            * config/rs6000/vsx.md (vsx_extract_v4df): New insn.
            (vsx_extract_v8sf): Likewise.
            * doc/invoke.texi (RS/6000 and PowerPC Options): Document
            -mvector-size-32.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-size-32-1.c: New test.
            * gcc.target/powerpc/vector-size-32-2.c: Likewise.
            * gcc.target/powerpc/vector-size-32-3.c: Likewise.
            * gcc.target/powerpc/vector-size-32-4.c: Likewise.
            * gcc.target/powerpc/vector-size-32-5.c: Likewise.
            * gcc.target/powerpc/vector-size-32-6.c: Likewise.

Diff:
---
 gcc/config/rs6000/predicates.md                    |   9 +-
 gcc/config/rs6000/rs6000-c.cc                      |   3 +
 gcc/config/rs6000/rs6000-protos.h                  |   7 +
 gcc/config/rs6000/rs6000.cc                        | 377 ++++++++++-
 gcc/config/rs6000/rs6000.h                         |   6 +
 gcc/config/rs6000/rs6000.md                        |   7 +-
 gcc/config/rs6000/rs6000.opt                       |   4 +
 gcc/config/rs6000/vector-pair.md                   | 704 +++++++++++++++++++++
 gcc/config/rs6000/vsx.md                           |  56 ++
 .../gcc.target/powerpc/vector-size-32-1.c          |  85 +++
 .../gcc.target/powerpc/vector-size-32-2.c          |  96 +++
 .../gcc.target/powerpc/vector-size-32-3.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-4.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-5.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-6.c          | 137 ++++
 15 files changed, 1880 insertions(+), 22 deletions(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c4..8a56487d7d2 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -327,6 +327,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
 
+;; Match op = 0..31
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
 ;; Return 1 if op is a 34-bit constant integer.
 (define_predicate "cint34_operand"
   (match_code "const_int")
@@ -1301,8 +1306,10 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(GET_MODE_SIZE (mode) == 16 && VECTOR_MODE_P (mode)
 		&& (vsx_register_operand (op, mode)
+		    || op == CONST0_RTX (mode)
+		    || vsx_prefixed_constant (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
 			    || quad_address_p (XEXP (op, 0), mode, false)))))"))
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 65be0ac43e2..27114b14022 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -631,6 +631,9 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
     builtin_define ("__SIZEOF_IBM128__=16");
   if (ieee128_float_type_node)
     builtin_define ("__SIZEOF_IEEE128__=16");
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    builtin_define ("__VECTOR_SIZE_32__");
+
 #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB
   builtin_define ("__BUILTIN_CPU_SUPPORTS__");
 #endif
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..13687c5b1b3 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -61,7 +61,9 @@ extern bool rs6000_move_128bit_ok_p (rtx []);
 extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
+extern void rs6000_expand_vector_pair_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, rtx);
+extern void rs6000_expand_vector_pair_set (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
@@ -138,6 +140,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0dd21e67dde..d3aecb68565 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -1843,7 +1843,7 @@ rs6000_hard_regno_mode_ok_uncached (int regno, machine_mode mode)
 
   /* Vector pair modes need even/odd VSX register pairs.  Only allow vector
      registers.  */
-  if (mode == OOmode)
+  if (VECTOR_PAIR_MODE (mode))
     return (TARGET_MMA && VSX_REGNO_P (regno) && (regno & 1) == 0);
 
   /* MMA accumulator modes need FPR registers divisible by 4.  */
@@ -1954,9 +1954,10 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
    GPR registers, and TImode can go in any GPR as well as VSX registers (PR
    57744).
 
-   Similarly, don't allow OOmode (vector pair, restricted to even VSX
-   registers) or XOmode (vector quad, restricted to FPR registers divisible
-   by 4) to tie with other modes.
+   Similarly, don't allow XOmode (vector quad, restricted to FPR registers
+   divisible by 4) to tie with other modes.
+
+   Vector pair modes can tie with other vector pair modes.
 
    Altivec/VSX vector tests were moved ahead of scalar float mode, so that IEEE
    128-bit floating point on VSX systems ties with other vectors.  */
@@ -1964,9 +1965,14 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 static bool
 rs6000_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
-  if (mode1 == PTImode || mode1 == OOmode || mode1 == XOmode
-      || mode2 == PTImode || mode2 == OOmode || mode2 == XOmode)
-    return mode1 == mode2;
+  if (mode1 == PTImode || mode1 == XOmode
+       || mode2 == PTImode || mode2 == XOmode)
+     return mode1 == mode2;
+ 
+  if (VECTOR_PAIR_MODE (mode1))
+    return VECTOR_PAIR_MODE (mode2);
+  if (VECTOR_PAIR_MODE (mode2))
+    return ALTIVEC_OR_VSX_VECTOR_MODE (mode1);
 
   if (ALTIVEC_OR_VSX_VECTOR_MODE (mode1))
     return ALTIVEC_OR_VSX_VECTOR_MODE (mode2);
@@ -2715,13 +2721,13 @@ rs6000_setup_reg_addr_masks (void)
 	     of the LXVP or STXVP instructions, do not allow indexed mode so
 	     that we can split the load/store.  */
 	  else if ((addr_mask != 0) && TARGET_MMA
-		   && (m2 == OOmode || m2 == XOmode))
+		   && (VECTOR_PAIR_MODE (m2) || m2 == XOmode))
 	    {
 	      addr_mask |= RELOAD_REG_OFFSET;
 	      if (rc == RELOAD_REG_FPR || rc == RELOAD_REG_VMX)
 		{
 		  addr_mask |= RELOAD_REG_QUAD_OFFSET;
-		  if (m2 == OOmode
+		  if (VECTOR_PAIR_MODE (m2)
 		      && TARGET_LOAD_VECTOR_PAIR
 		      && TARGET_STORE_VECTOR_PAIR)
 		    addr_mask |= RELOAD_REG_INDEXED;
@@ -2941,6 +2947,33 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
       rs6000_vector_align[XOmode] = 512;
     }
 
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      rs6000_vector_unit[V32QImode] = VECTOR_NONE;
+      rs6000_vector_mem[V32QImode] = VECTOR_VSX;
+      rs6000_vector_align[V32QImode] = 256;
+
+      rs6000_vector_unit[V16HImode] = VECTOR_NONE;
+      rs6000_vector_mem[V16HImode] = VECTOR_VSX;
+      rs6000_vector_align[V16HImode] = 256;
+
+      rs6000_vector_unit[V8SImode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SImode] = VECTOR_VSX;
+      rs6000_vector_align[V8SImode] = 256;
+
+      rs6000_vector_unit[V8SFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SFmode] = VECTOR_VSX;
+      rs6000_vector_align[V8SFmode] = 256;
+
+      rs6000_vector_unit[V4DImode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DImode] = VECTOR_VSX;
+      rs6000_vector_align[V4DImode] = 256;
+
+      rs6000_vector_unit[V4DFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DFmode] = VECTOR_VSX;
+      rs6000_vector_align[V4DFmode] = 256;
+    }
+
   /* Register class constraints for the constraints that depend on compile
      switches. When the VSX code was added, different constraints were added
      based on the type (DFmode, V2DFmode, V4SFmode).  For the vector types, all
@@ -3072,6 +3105,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 		  reg_addr[XOmode].reload_store = CODE_FOR_reload_xo_di_store;
 		  reg_addr[XOmode].reload_load = CODE_FOR_reload_xo_di_load;
 		}
+
+	      if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+		{
+		  reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_di_store;
+		  reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_di_load;
+		  reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_di_store;
+		  reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_di_load;
+		  reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_di_store;
+		  reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_di_load;
+		  reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_di_store;
+		  reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_di_load;
+		  reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_di_store;
+		  reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_di_load;
+		  reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_di_store;
+		  reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_di_load;
+		}
 	    }
 	}
       else
@@ -3129,6 +3178,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 	      reg_addr[DDmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdd;
 	      reg_addr[DFmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdf;
 	    }
+
+	  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+	    {
+	      reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_si_store;
+	      reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_si_load;
+	      reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_si_store;
+	      reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_si_load;
+	      reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_si_store;
+	      reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_si_load;
+	      reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_si_store;
+	      reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_si_load;
+	      reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_si_store;
+	      reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_si_load;
+	      reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_si_store;
+	      reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_si_load;
+	    }
 	}
 
       reg_addr[DFmode].scalar_in_vmx_p = true;
@@ -4429,6 +4494,15 @@ rs6000_option_override_internal (bool global_init_p)
       rs6000_isa_flags &= OPTION_MASK_STORE_VECTOR_PAIR;
     }
 
+  if (!TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (OPTION_SET_P (TARGET_VECTOR_SIZE_32))
+	warning (0, "%qs should not be used unless you use %qs",
+		 "-mvector-size-32", "-mmma");
+
+      TARGET_VECTOR_SIZE_32 = 0;
+    }
+
   /* Enable power10 fusion if we are tuning for power10, even if we aren't
      generating power10 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION))
@@ -7275,6 +7349,142 @@ rs6000_expand_vector_init (rtx target, rtx vals)
   emit_move_insn (target, mem);
 }
 
+/* For a vector pair mode, return the equivalent vector mode or VOIDmode.  */
+
+static machine_mode
+vector_pair_to_vector_mode (machine_mode mode)
+{
+  machine_mode vmode;
+
+  switch (mode)
+    {
+    case E_V32QImode:  vmode = V16QImode; break;
+    case E_V16HImode:  vmode = V8HImode;  break;
+    case E_V8SImode:   vmode = V4SImode;  break;
+    case E_V4DImode:   vmode = V2DImode;  break;
+    case E_V8SFmode:   vmode = V4SFmode;  break;
+    case E_V4DFmode:   vmode = V2DFmode;  break;
+    case E_OOmode:     vmode = V1TImode;  break;
+    default:           vmode = VOIDmode;  break;
+    }
+
+  return vmode;
+}
+
+/* Split a vector constant for a type that can be held into a vector register
+   pair into 2 separate constants that can be held in a single vector register.
+   Return true if we can split the constant.  */
+
+static bool
+rs6000_split_vpair_constant (rtx op, rtx *high, rtx *low)
+{
+  machine_mode vmode = vector_pair_to_vector_mode (GET_MODE (op));
+
+  *high = *low = NULL_RTX;
+
+  if (!CONST_VECTOR_P (op) || vmode == GET_MODE (op))
+    return false;
+
+  size_t nunits = GET_MODE_NUNITS (vmode);
+  rtvec hi_vec = rtvec_alloc (nunits);
+  rtvec lo_vec = rtvec_alloc (nunits);
+
+  for (size_t i = 0; i < nunits; i++)
+    {
+      RTVEC_ELT (hi_vec, i) = CONST_VECTOR_ELT (op, i);
+      RTVEC_ELT (lo_vec, i) = CONST_VECTOR_ELT (op, i + nunits);
+    }
+
+  *high = gen_rtx_CONST_VECTOR (vmode, hi_vec);
+  *low = gen_rtx_CONST_VECTOR (vmode, lo_vec);
+  return true;
+}
+
+/* Initialize vector pair TARGET to VALS.  */
+
+void
+rs6000_expand_vector_pair_init (rtx target, rtx vals)
+{
+  machine_mode mode_vpair = GET_MODE (target);
+  machine_mode mode_vector;
+  size_t n_elts_vpair = GET_MODE_NUNITS (mode_vpair);
+  bool all_same = true;
+  rtx first = XVECEXP (vals, 0, 0);
+  rtx (*gen_splat) (rtx, rtx);
+  rtx (*gen_concat) (rtx, rtx, rtx);
+
+  switch (mode_vpair)
+    {
+    case E_V32QImode:
+      mode_vector = V16QImode;
+      gen_splat = gen_vpair_splat_v32qi;
+      gen_concat = gen_vpair_concat_v32qi;
+      break;
+
+    case E_V16HImode:
+      mode_vector = V8HImode;
+      gen_splat = gen_vpair_splat_v16hi;
+      gen_concat = gen_vpair_concat_v16hi;
+      break;
+
+    case E_V8SImode:
+      mode_vector = V4SImode;
+      gen_splat = gen_vpair_splat_v8si;
+      gen_concat = gen_vpair_concat_v8si;
+      break;
+
+    case E_V4DImode:
+      mode_vector = V2DImode;
+      gen_splat = gen_vpair_splat_v4di;
+      gen_concat = gen_vpair_concat_v4di;
+      break;
+
+    case E_V8SFmode:
+      mode_vector = V4SFmode;
+      gen_splat = gen_vpair_splat_v8sf;
+      gen_concat = gen_vpair_concat_v8sf;
+      break;
+
+    case E_V4DFmode:
+      mode_vector = V2DFmode;
+      gen_splat = gen_vpair_splat_v4df;
+      gen_concat = gen_vpair_concat_v4df;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* See if we can do a splat operation.  */
+  for (size_t i = 1; i < n_elts_vpair; ++i)
+    {
+      if (!rtx_equal_p (XVECEXP (vals, 0, i), first))
+	{
+	  all_same = false;
+	  break;
+	}
+    }
+
+  if (all_same)
+    {
+      emit_insn (gen_splat (target, first));
+      return;
+    }
+
+  /* Break the initialization into two parts.  */
+  rtx vector_hi = gen_reg_rtx (mode_vector);
+  rtx vector_lo = gen_reg_rtx (mode_vector);
+  rtx vals_hi;
+  rtx vals_lo;
+
+  rs6000_split_vpair_constant (vals, &vals_hi, &vals_lo);
+
+  rs6000_expand_vector_init (vector_hi, vals_hi);
+  rs6000_expand_vector_init (vector_lo, vals_lo);
+  emit_insn (gen_concat (target, vector_hi, vector_lo));
+  return;
+}
+
 /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX
    is variable and also counts by vector element size for p9 and above.  */
 
@@ -7603,6 +7813,15 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Set field ELT_RTX of vaector pair TARGET to VAL.  */
+
+void
+rs6000_expand_vector_pair_set (rtx target, rtx val, rtx elt_rtx)
+{
+  if (target || val || elt_rtx)
+    gcc_unreachable ();
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
@@ -8694,6 +8913,12 @@ reg_offset_addressing_ok_p (machine_mode mode)
       /* The vector pair/quad types support offset addressing if the
 	 underlying vectors support offset addressing.  */
     case E_OOmode:
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SImode:
+    case E_V8SFmode:
+    case E_V4DImode:
+    case E_V4DFmode:
     case E_XOmode:
       return TARGET_MMA;
 
@@ -11202,6 +11427,12 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
     case E_V2DFmode:
     case E_V2DImode:
     case E_V1TImode:
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SFmode:
+    case E_V8SImode:
+    case E_V4DFmode:
+    case E_V4DImode:
       if (CONSTANT_P (operands[1])
 	  && !easy_vector_constant (operands[1], mode))
 	operands[1] = force_const_mem (mode, operands[1]);
@@ -13456,7 +13687,7 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
      the GPR registers.  */
   if (rclass == GEN_OR_FLOAT_REGS)
     {
-      if (mode == OOmode)
+      if (VECTOR_PAIR_MODE (mode))
 	return VSX_REGS;
 
       if (mode == XOmode)
@@ -23417,6 +23648,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   rtx tmp = target;
   rtx norreg = gen_reg_rtx (V16QImode);
   machine_mode mode = GET_MODE (target);
+  machine_mode qi_vmode = VECTOR_PAIR_MODE (mode) ? V32QImode : V16QImode;
 
   /* Get everything in regs so the pattern matches.  */
   if (!REG_P (op0))
@@ -23424,7 +23656,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   if (!REG_P (op1))
     op1 = force_reg (mode, op1);
   if (!REG_P (sel))
-    sel = force_reg (V16QImode, sel);
+    sel = force_reg (qi_vmode, sel);
   if (!REG_P (target))
     tmp = gen_reg_rtx (mode);
 
@@ -23437,10 +23669,10 @@ altivec_expand_vec_perm_le (rtx operands[4])
     {
       /* Invert the selector with a VNAND if available, else a VNOR.
 	 The VNAND is preferred for future fusion opportunities.  */
-      notx = gen_rtx_NOT (V16QImode, sel);
+      notx = gen_rtx_NOT (qi_vmode, sel);
       iorx = (TARGET_P8_VECTOR
-	      ? gen_rtx_IOR (V16QImode, notx, notx)
-	      : gen_rtx_AND (V16QImode, notx, notx));
+	      ? gen_rtx_IOR (qi_vmode, notx, notx)
+	      : gen_rtx_AND (qi_vmode, notx, notx));
       emit_insn (gen_rtx_SET (norreg, iorx));
 
       /* Permute with operands reversed and adjusted selector.  */
@@ -24572,6 +24804,9 @@ static struct rs6000_opt_var const rs6000_opt_vars[] =
   { "speculate-indirect-jumps",
     offsetof (struct gcc_options, x_rs6000_speculate_indirect_jumps),
     offsetof (struct cl_target_option, x_rs6000_speculate_indirect_jumps), },
+  { "vector-size-32",
+    offsetof (struct gcc_options, x_TARGET_VECTOR_SIZE_32),
+    offsetof (struct cl_target_option, x_TARGET_VECTOR_SIZE_32), },
 };
 
 /* Inner function to handle attribute((target("..."))) and #pragma GCC target
@@ -27408,6 +27643,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, src.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
@@ -27426,6 +27735,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
   int reg_mode_size;
   /* The number of registers that will be moved.  */
   int nregs;
+  /* Hi/lo values for splitting vector pair constants.  */
+  rtx vpair_hi, vpair_lo;
 
   reg = REG_P (dst) ? REGNO (dst) : REGNO (src);
   mode = GET_MODE (dst);
@@ -27441,8 +27752,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
     }
   /* If we have a vector pair/quad mode, split it into two/four separate
      vectors.  */
-  else if (mode == OOmode || mode == XOmode)
-    reg_mode = V1TImode;
+  else if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
+    {
+      machine_mode vmode = vector_pair_to_vector_mode (mode);
+      reg_mode = (vmode == VOIDmode) ? V1TImode : vmode;
+    }
   else if (FP_REGNO_P (reg))
     reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode :
 	(TARGET_HARD_FLOAT ? DFmode : SFmode);
@@ -27454,6 +27768,29 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 
   gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
 
+  /* Handle vector pair constants.  */
+  if (CONST_VECTOR_P (src) && VECTOR_PAIR_MODE (mode) && TARGET_MMA
+      && rs6000_split_vpair_constant (src, &vpair_hi, &vpair_lo)
+      && VSX_REGNO_P (reg))
+    {
+      reg_mode = GET_MODE (vpair_hi);
+      rtx reg_hi = gen_rtx_REG (reg_mode, reg);
+      rtx reg_lo = gen_rtx_REG (reg_mode, reg + 1);
+
+      emit_move_insn (reg_hi, vpair_hi);
+
+      /* 0.0 is easy.  For other constants, copy the high register into the low
+	 register if the two sets of constants are equal.  This means we won't
+	 be doing back to back prefixed load immediate instructions.  */
+      if (rtx_equal_p (vpair_hi, vpair_lo)
+	  && !rtx_equal_p (vpair_hi, CONST0_RTX (reg_mode)))
+	emit_move_insn (reg_lo, reg_hi);
+      else
+	emit_move_insn (reg_lo, vpair_lo);
+      
+      return;
+    }
+      
   /* TDmode residing in FP registers is special, since the ISA requires that
      the lower-numbered word of a register pair is always the most significant
      word, even in little-endian mode.  This does not match the usual subreg
@@ -27493,7 +27830,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
      below.  This means the last register gets the first memory
      location.  We also need to be careful of using the right register
      numbers if we are splitting XO to OO.  */
-  if (mode == OOmode || mode == XOmode)
+  if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
     {
       nregs = hard_regno_nregs (reg, mode);
       int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
@@ -27553,7 +27890,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	  gcc_assert (REG_P (dst));
 	  if (GET_MODE (src) == XOmode)
 	    gcc_assert (FP_REGNO_P (REGNO (dst)));
-	  if (GET_MODE (src) == OOmode)
+	  if (VECTOR_PAIR_MODE (GET_MODE (src)))
 	    gcc_assert (VSX_REGNO_P (REGNO (dst)));
 
 	  int nvecs = XVECLEN (src, 0);
@@ -27628,7 +27965,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	 overlap.  */
       int i;
       /* XO/OO are opaque so cannot use subregs. */
-      if (mode == OOmode || mode == XOmode )
+      if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	{
 	  for (i = nregs - 1; i >= 0; i--)
 	    {
@@ -27802,7 +28139,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	    continue;
 
 	  /* XO/OO are opaque so cannot use subregs. */
-	  if (mode == OOmode || mode == XOmode )
+	  if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	    {
 	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
 	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 326c45221e9..32848f7d15b 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -1006,6 +1006,12 @@ enum data_align { align_abi, align_opt, align_both };
   (ALTIVEC_VECTOR_MODE (MODE) || VSX_VECTOR_MODE (MODE)			\
    || (MODE) == V2DImode || (MODE) == V1TImode)
 
+/* Whether a mode is held in paired vector registers.  */
+#define VECTOR_PAIR_MODE(MODE)						\
+  ((MODE) == OOmode							\
+   || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode)
+
 /* Post-reload, we can't use any new AltiVec registers, as we already
    emitted the vrsave mask.  */
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..e9f2244c216 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -683,9 +683,13 @@
 		      (HI    "h")
 		      (SI    "w")
 		      (DI    "d")
+		      (V32QI "b")
 		      (V16QI "b")
+		      (V16HI "h")
 		      (V8HI  "h")
+		      (V8SI  "w")
 		      (V4SI  "w")
+		      (V4DI  "d")
 		      (V2DI  "d")
 		      (V1TI  "q")
 		      (TI    "q")])
@@ -812,7 +816,7 @@
 ;; supplement addressing modes.
 (define_mode_iterator RELOAD [V16QI V8HI V4SI V2DI V4SF V2DF V1TI
 			      SF SD SI DF DD DI TI PTI KF IF TF
-			      OO XO])
+			      OO XO V32QI V16HI V8SI V8SF V4DI V4DF])
 
 ;; Iterate over smin, smax
 (define_code_iterator fp_minmax	[smin smax])
@@ -15767,6 +15771,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 369095df9ed..bc2966f6120 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -605,6 +605,10 @@ mstore-vector-pair
 Target Undocumented Mask(STORE_VECTOR_PAIR) Var(rs6000_isa_flags)
 Generate (do not generate) store vector pair instructions.
 
+mvector-size-32
+Target Undocumented Var(TARGET_VECTOR_SIZE_32) Init(0) Save
+Generate (do not generate) vector pair instructions for vector_size(32).
+
 mrelative-jumptables
 Target Undocumented Var(rs6000_relative_jumptables) Init(1) Save
 
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..9dd9b1f6864
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,704 @@
+;; Vector pair arithmetic and logical instruction support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; This function adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+;; Iterator for all vector pair modes.  Even though we do not provide integer
+;; vector pair operations at this time, we need to support loading and storing
+;; integer vector pairs for perumte operations (and eventually compare).
+(define_mode_iterator VPAIR [V32QI V16HI V8SI V4DI V8SF V4DF])
+
+;; Floating point vector pair ops
+(define_mode_iterator VPAIR_FP [V8SF V4DF])
+
+;; Iterator for floating point unary/binary operations.
+(define_code_iterator VPAIR_FP_UNARY  [abs neg])
+(define_code_iterator VPAIR_FP_BINARY [plus minus mult smin smax])
+
+;; Integer vector pair ops.  We need the basic logical opts to support
+;; permution on little endian systems.
+(define_mode_iterator VPAIR_INT [V32QI V16HI V8SI V4DI])
+
+;; Special iterators for NEG (V4SI and V2DI have vneg{w,d}), while V16QI and
+;; V8HI have to use a subtract from 0.
+(define_mode_iterator VPAIR_NEG_VNEG [V4DI V8SI])
+(define_mode_iterator VPAIR_NEG_SUB [V32QI V16HI])
+
+;; Iterator integer unary/binary operations.  Logical operations can be done on
+;; all VSX registers, while the binary int operators need Altivec registers.
+(define_code_iterator VPAIR_LOGICAL_UNARY  [not])
+(define_code_iterator VPAIR_LOGICAL_BINARY [and ior xor])
+
+(define_code_iterator VPAIR_INT_BINARY     [plus minus smin smax])
+
+;; Give the insn name from the opertion
+(define_code_attr vpair_op [(abs   "abs")
+			    (div   "div")
+			    (and   "and")
+			    (fma   "fma")
+			    (ior   "ior")
+			    (minus "sub")
+			    (mult  "mul")
+			    (neg   "neg")
+			    (not   "one_cmpl")
+			    (plus  "add")
+			    (smin  "smin")
+			    (smax  "smax")
+			    (xor   "xor")])
+
+;; Map vector pair mode to vector mode in upper case after the vector pair is
+;; split to two vectors.
+(define_mode_attr VPAIR_VECTOR [(V32QI "V16QI")
+				(V16HI "V8HI")
+				(V8SI  "V4SI")
+				(V4DI  "V2DI")
+				(V8SF  "V4SF")
+                                (V4DF  "V2DF")])
+
+;; Map vector pair mode to vector mode in lower case after the vector pair is
+;; split to two vectors.
+(define_mode_attr vpair_vector_l [(V32QI "v16qi")
+				  (V16HI "v8hi")
+				  (V8SI  "v4si")
+				  (V4DI  "v2di")
+				  (V8SF  "v4sf")
+				  (V4DF  "v2df")])
+
+;; Map vector pair mode to the base element mode.
+(define_mode_attr VPAIR_ELEMENT [(V32QI "QI")
+				 (V16HI "HI")
+				 (V8SI  "SI")
+				 (V4DI  "DI")
+				 (V8SF  "SF")
+				 (V4DF  "DF")])
+
+;; Map vector pair mode to the base element mode in lower case.
+(define_mode_attr vpair_element_l [(V32QI "qi")
+				   (V16HI "hi")
+				   (V8SI  "si")
+				   (V4DI  "di")
+				   (V8SF  "sf")
+				   (V4DF  "df")])
+
+;; Vector pair move support.
+(define_expand "mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand")
+	(match_operand:VPAIR 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_emit_move (operands[0], operands[1], <MODE>mode);
+  DONE;
+})
+
+(define_insn_and_split "*mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand" "=wa,wa,ZwO,QwO,wa,wa")
+	(match_operand:VPAIR 1 "input_operand" "ZwO,QwO,wa,wa,wa,j"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], <MODE>mode)
+       || gpc_reg_operand (operands[1], <MODE>mode))"
+  "@
+   lxvp%X1 %x0,%1
+   #
+   stxvp%X0 %x1,%0
+   #
+   #
+   #"
+  "&& reload_completed
+   && ((MEM_P (operands[0]) && !TARGET_STORE_VECTOR_PAIR)
+       || (MEM_P (operands[1]) && !TARGET_LOAD_VECTOR_PAIR)
+       || (!MEM_P (operands[0]) && !MEM_P (operands[1])))"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (<MODE>mode))
+    {
+      machine_mode vmode = <VPAIR_VECTOR>mode;
+      rtx op0_reg0 = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+      rtx op0_reg1 = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+      rtx zero = CONST0_RTX (vmode);
+      emit_move_insn (op0_reg0, zero);
+      emit_move_insn (op0_reg1, zero);
+      DONE;
+    }
+
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecload,vecstore,vecstore,veclogical,vecperm")
+   (set_attr "size" "256")
+   (set_attr "length" "*,8,*,8,8,8")
+   (set_attr "isa" "lxvp,*,stxvp,*,*,*")])
+\f
+;; Vector pair initialization
+(define_expand "vec_init<mode><vpair_element_l>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:VPAIR 1 "")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_init (operands[0], operands[1]);
+  DONE;
+})
+
+;; Vector pair set element
+(define_expand "vec_set<mode>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:<VPAIR_ELEMENT> 1 "register_operand")
+   (match_operand 2 "vec_set_index_operand")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_set (operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+;; Vector pair extraction
+(define_insn_and_split "vec_extract<mode><vpair_element_l>"
+  [(set (match_operand:<VPAIR_ELEMENT> 0 "vsx_register_operand" "=wa")
+	(vec_select:<VPAIR_ELEMENT>
+	 (match_operand:VPAIR 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_int_operand" "n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  HOST_WIDE_INT elt = INTVAL (operands[2]);
+  machine_mode mode = <MODE>mode;
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  unsigned vsize = GET_MODE_SIZE (<VPAIR_VECTOR>mode);
+  unsigned reg_num = ((WORDS_BIG_ENDIAN && elt >= vsize)
+		      || (!WORDS_BIG_ENDIAN && elt < vsize));
+	   
+  rtx vreg = simplify_gen_subreg (vmode, op1, mode, reg_num * 16);
+  emit_insn (gen_vsx_extract_<vpair_vector_l> (op0, vreg,
+					       GEN_INT (elt % vsize)));
+  DONE;
+})
+
+;; Assemble a vector pair from two vectors.
+;;
+;; We have both endian versions to change which input register will be moved
+;; the the first register in the vector pair.
+(define_expand "vpair_concat_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32")
+
+(define_insn_and_split "*vpair_concat_<mode>_be"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=wa,&wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "0,mwajeP")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*vpair_concat_<mode>_le"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=&wa,wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "mwajeP,0")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && !WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+;; Zero a vector pair
+(define_expand "vpair_zero_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand") (match_dup 1))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  operands[1] = CONST0_RTX (<MODE>mode);
+})
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<mode>"
+  [(use (match_operand:VPAIR 0 "vsx_register_operand"))
+   (use (match_operand:<VPAIR_ELEMENT> 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (vmode))
+    {
+      emit_insn (gen_vpair_zero_<mode> (op0));
+      DONE;
+    }
+
+  rtx tmp = gen_reg_rtx (vmode);
+
+  unsigned num_elements = GET_MODE_NUNITS (vmode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (size_t i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rtx vec_elements = gen_rtx_PARALLEL (vmode, elements);
+  rs6000_expand_vector_init (tmp, vec_elements);
+  emit_insn (gen_vpair_concat_<mode> (op0, tmp, tmp));
+  DONE;
+})
+\f
+;; Vector pair floating point arithmetic unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_UNARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize negative absolute value (both floating point and integer)
+(define_insn_and_split "nabs<mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(neg:VPAIR_FP
+	 (abs:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_vsx_nabs<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point arithmetic binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_BINARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-add
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-subtract
+(define_insn_and_split "fms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (neg:VPAIR_FP
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point negative fused multiply-add
+(define_insn_and_split "nfma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused negative multiply-subtract
+(define_insn_and_split "nfms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (neg:VPAIR_FP
+	   (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into fma (a, b, c)
+(define_insn_and_split "*fma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(plus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (match_dup 3)))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into fma (a, b, -c)
+(define_insn_and_split "*fms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(minus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (neg:VPAIR_FP
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) + c) into -fma (a, b, c)
+(define_insn_and_split "*nfma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (plus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into -fma (a, b, -c)
+(define_insn_and_split "*nfms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (minus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (neg:VPAIR_FP
+			(match_dup 3)))))]
+{
+}
+  [(set_attr "length" "8")])
+\f
+;; Vector pair negate if we have the VNEGx instruction.
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_VNEG 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_VNEG
+	 (match_operand:VPAIR_NEG_VNEG 1 "vsx_register_operand" "v")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_neg<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair negate if we have to do a subtract from 0
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_SUB 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_SUB
+	 (match_operand:VPAIR_NEG_SUB 1 "vsx_register_operand" "v")))
+   (clobber (match_scratch:<VPAIR_VECTOR> 2 "=&v"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = <VPAIR_VECTOR>mode;
+  rtx tmp = operands[2];
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  emit_move_insn (tmp, CONST0_RTX (mode));
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0),
+				       tmp,
+				       gen_rtx_REG (mode, reg1)));
+
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0 + 1),
+				       tmp,
+				       gen_rtx_REG (mode, reg1 + 1)));
+
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_UNARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_BINARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optiomize vector pair ~(a | b)  or ((~a) & (~b)) to produce xxlnor
+(define_insn_and_split "*nor<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (ior:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nor<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) & b to use xxlandc
+(define_insn_and_split "*andc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_andc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair ~(a ^ b) to produce xxleqv
+(define_insn_and_split "*eqv<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (xor:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+[(set_attr "length" "8")])
+
+
+;; Optiomize vector pair ~(a & b) or ((~a) | (~b)) to produce xxlnand
+(define_insn_and_split "*nand<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (and:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nand<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) | b to produce xxlorc
+(define_insn_and_split "*orc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_orc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 26fa32829af..7d1f372d76e 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3566,6 +3566,33 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Exctract DF from V4DF, convert it into extract from V2DF.
+(define_insn_and_split "vsx_extract_v4df"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=wa,r")
+	(vec_select:DF
+	 (match_operand:V4DF 1 "gpc_reg_operand" "wa,wa")
+	 (parallel
+	  [(match_operand:QI 2 "const_0_to_3_operand" "n,n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(vec_select:DF
+	 (match_dup 3)
+	 (parallel [(match_dup 4)])))]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 2)
+      || (!WORDS_BIG_ENDIAN && element < 2))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V2DFmode, reg_num);
+  operands[4] = GEN_INT (element & 1);
+}
+  [(set_attr "type" "mfvsr,vecperm")])
+
 ;; Extract a SF element from V4SF
 (define_insn_and_split "vsx_extract_v4sf"
   [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
@@ -3653,6 +3680,35 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Extract SF from V8SF, converting it into an extract from V4SF
+(define_insn_and_split "vsx_extract_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(vec_select:SF
+	 (match_operand:V8SF 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_0_to_7_operand" "n")])))
+   (clobber (match_scratch:V4SF 3 "=0"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_select:SF
+		    (match_dup 4)
+		    (parallel [(match_dup 5)])))
+	      (clobber (match_dup 3))])]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 4)
+      || (!WORDS_BIG_ENDIAN && element < 4))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V4SFmode, reg_num);
+  operands[4] = GEN_INT (element & 3);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "fp")])
+
 ;; Expand the builtin form of xxpermdi to canonical rtl.
 (define_expand "vsx_xxpermdi_<mode>"
   [(match_operand:VSX_L 0 "vsx_register_operand")
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
new file mode 100644
index 00000000000..89343c44a34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 double elements.  */
+
+typedef double vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmadddp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubdp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
new file mode 100644
index 00000000000..c598f6307d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 8 float elements.  */
+
+typedef float vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmaddsp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubsp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       19 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}       8 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
new file mode 100644
index 00000000000..b1952b046f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef long long vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddudm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubudm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegd, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegd\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
new file mode 100644
index 00000000000..110292bb4df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef int vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduwm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuwm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegw, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvadduwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegw\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
new file mode 100644
index 00000000000..8921b04c468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef short vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduhm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuhm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsubuhm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}     24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}    13 } } */
+/* { dg-final { scan-assembler-times {\mvadduhm\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuhm\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
new file mode 100644
index 00000000000..a905e6b0a31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef unsigned char vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddubm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsububm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsububm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}      24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}     13 } } */
+/* { dg-final { scan-assembler-times {\mvaddubm\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsububm\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}   1 } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work146-vsize)] Add vector_size(32) support.
@ 2023-11-18  5:47 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2023-11-18  5:47 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:249561480abcffeab1dc8bcddd6b209247e4f7d7

commit 249561480abcffeab1dc8bcddd6b209247e4f7d7
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Sat Nov 18 00:46:22 2023 -0500

    Add vector_size(32) support.
    
    2023-11-18  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (const_0_to_31_operand): New predicate.
            (mma_assemble_input_operand): Allow any 16-byte vector type, not just
            V16QImode.
            * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
            __VECTOR_SIZE_32__ if -mvector-size-32 used.
            * config/rs6000/rs6000-protos.h (rs6000_expand_vector_pair_init): New
            declaration.
            (rs6000_expand_vector_pair_set): Likewise.
            (split_unary_vector_pair): Likewise.
            (split_binary_vector_pair): Likewisee.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (rs6000_hard_regno_mode_ok_uncached): Add
            support for -mvector-size-32.
            (rs6000_hard_regno_mode_ok): Likewise.
            (rs6000_setup_reg_addr_masks): Likewise.
            (rs6000_init_hard_regno_mode_ok): Likewise.
            (rs6000_option_override_internal): Likewise.
            (rs6000_builtin_vectorization_cost): Likewise.
            (vector_pair_to_vector_mode): New function.
            (+rs6000_split_vpair_constan): Likewise.
            (rs6000_expand_vector_pair_ini): Likewise.
            (rs6000_expand_vector_pair_set): Likewise.
            (reg_offset_addressing_ok_p): Likewise.
            (rs6000_emit_move): Likewise.
            (altivec_expand_vec_perm_le): Likewise.
            (rs6000_opt_var): Add -mvector-size-32.
            (split_unary_vector_pair): New function.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pai): Likewise.
            (rs6000_split_multireg_move): Add -mvector-size-32 support.
            * config/rs6000/rs6000.h (VECTOR_PAIR_MODE): New macro.
            * config/rs6000/rs6000.md (wd mode attribute): Add vector pair modes.
            (RELOAD mode iterator): Likewise.
            (toplevel): Include vector-pair.md.
            * config/rs6000/rs6000.opt (-mvector-size-32): New option.
            * config/rs6000/vector-pair.md: New file.
            * config/rs6000/vsx.md (vsx_extract_v4df): New insn.
            (vsx_extract_v8sf): Likewise.
            * doc/invoke.texi (RS/6000 and PowerPC Options): Document
            -mvector-size-32.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-size-32-1.c: New test.
            * gcc.target/powerpc/vector-size-32-2.c: Likewise.
            * gcc.target/powerpc/vector-size-32-3.c: Likewise.
            * gcc.target/powerpc/vector-size-32-4.c: Likewise.
            * gcc.target/powerpc/vector-size-32-5.c: Likewise.
            * gcc.target/powerpc/vector-size-32-6.c: Likewise.

Diff:
---
 gcc/config/rs6000/predicates.md   |   9 +-
 gcc/config/rs6000/rs6000-c.cc     |   3 +
 gcc/config/rs6000/rs6000-protos.h |   7 +
 gcc/config/rs6000/rs6000.cc       | 381 ++++++++++++++++++++++++++++++++++++--
 gcc/config/rs6000/rs6000.h        |   6 +
 gcc/config/rs6000/rs6000.md       |   7 +-
 gcc/config/rs6000/rs6000.opt      |   4 +
 gcc/config/rs6000/vsx.md          |  56 ++++++
 gcc/doc/invoke.texi               |  14 +-
 9 files changed, 464 insertions(+), 23 deletions(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c4..8a56487d7d2 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -327,6 +327,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
 
+;; Match op = 0..31
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
 ;; Return 1 if op is a 34-bit constant integer.
 (define_predicate "cint34_operand"
   (match_code "const_int")
@@ -1301,8 +1306,10 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(GET_MODE_SIZE (mode) == 16 && VECTOR_MODE_P (mode)
 		&& (vsx_register_operand (op, mode)
+		    || op == CONST0_RTX (mode)
+		    || vsx_prefixed_constant (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
 			    || quad_address_p (XEXP (op, 0), mode, false)))))"))
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 65be0ac43e2..27114b14022 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -631,6 +631,9 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
     builtin_define ("__SIZEOF_IBM128__=16");
   if (ieee128_float_type_node)
     builtin_define ("__SIZEOF_IEEE128__=16");
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    builtin_define ("__VECTOR_SIZE_32__");
+
 #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB
   builtin_define ("__BUILTIN_CPU_SUPPORTS__");
 #endif
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..13687c5b1b3 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -61,7 +61,9 @@ extern bool rs6000_move_128bit_ok_p (rtx []);
 extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
+extern void rs6000_expand_vector_pair_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, rtx);
+extern void rs6000_expand_vector_pair_set (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
@@ -138,6 +140,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0dd21e67dde..41c9f2c4e50 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -1843,7 +1843,7 @@ rs6000_hard_regno_mode_ok_uncached (int regno, machine_mode mode)
 
   /* Vector pair modes need even/odd VSX register pairs.  Only allow vector
      registers.  */
-  if (mode == OOmode)
+  if (VECTOR_PAIR_MODE (mode))
     return (TARGET_MMA && VSX_REGNO_P (regno) && (regno & 1) == 0);
 
   /* MMA accumulator modes need FPR registers divisible by 4.  */
@@ -1954,9 +1954,10 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
    GPR registers, and TImode can go in any GPR as well as VSX registers (PR
    57744).
 
-   Similarly, don't allow OOmode (vector pair, restricted to even VSX
-   registers) or XOmode (vector quad, restricted to FPR registers divisible
-   by 4) to tie with other modes.
+   Similarly, don't allow XOmode (vector quad, restricted to FPR registers
+   divisible by 4) to tie with other modes.
+
+   Vector pair modes can tie with other vector pair modes.
 
    Altivec/VSX vector tests were moved ahead of scalar float mode, so that IEEE
    128-bit floating point on VSX systems ties with other vectors.  */
@@ -1964,9 +1965,14 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 static bool
 rs6000_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
-  if (mode1 == PTImode || mode1 == OOmode || mode1 == XOmode
-      || mode2 == PTImode || mode2 == OOmode || mode2 == XOmode)
-    return mode1 == mode2;
+  if (mode1 == PTImode || mode1 == XOmode
+       || mode2 == PTImode || mode2 == XOmode)
+     return mode1 == mode2;
+ 
+  if (VECTOR_PAIR_MODE (mode1))
+    return VECTOR_PAIR_MODE (mode2);
+  if (VECTOR_PAIR_MODE (mode2))
+    return ALTIVEC_OR_VSX_VECTOR_MODE (mode1);
 
   if (ALTIVEC_OR_VSX_VECTOR_MODE (mode1))
     return ALTIVEC_OR_VSX_VECTOR_MODE (mode2);
@@ -2715,13 +2721,13 @@ rs6000_setup_reg_addr_masks (void)
 	     of the LXVP or STXVP instructions, do not allow indexed mode so
 	     that we can split the load/store.  */
 	  else if ((addr_mask != 0) && TARGET_MMA
-		   && (m2 == OOmode || m2 == XOmode))
+		   && (VECTOR_PAIR_MODE (m2) || m2 == XOmode))
 	    {
 	      addr_mask |= RELOAD_REG_OFFSET;
 	      if (rc == RELOAD_REG_FPR || rc == RELOAD_REG_VMX)
 		{
 		  addr_mask |= RELOAD_REG_QUAD_OFFSET;
-		  if (m2 == OOmode
+		  if (VECTOR_PAIR_MODE (m2)
 		      && TARGET_LOAD_VECTOR_PAIR
 		      && TARGET_STORE_VECTOR_PAIR)
 		    addr_mask |= RELOAD_REG_INDEXED;
@@ -2941,6 +2947,33 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
       rs6000_vector_align[XOmode] = 512;
     }
 
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      rs6000_vector_unit[V32QImode] = VECTOR_NONE;
+      rs6000_vector_mem[V32QImode] = VECTOR_VSX;
+      rs6000_vector_align[V32QImode] = 256;
+
+      rs6000_vector_unit[V16HImode] = VECTOR_NONE;
+      rs6000_vector_mem[V16HImode] = VECTOR_VSX;
+      rs6000_vector_align[V16HImode] = 256;
+
+      rs6000_vector_unit[V8SImode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SImode] = VECTOR_VSX;
+      rs6000_vector_align[V8SImode] = 256;
+
+      rs6000_vector_unit[V8SFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SFmode] = VECTOR_VSX;
+      rs6000_vector_align[V8SFmode] = 256;
+
+      rs6000_vector_unit[V4DImode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DImode] = VECTOR_VSX;
+      rs6000_vector_align[V4DImode] = 256;
+
+      rs6000_vector_unit[V4DFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DFmode] = VECTOR_VSX;
+      rs6000_vector_align[V4DFmode] = 256;
+    }
+
   /* Register class constraints for the constraints that depend on compile
      switches. When the VSX code was added, different constraints were added
      based on the type (DFmode, V2DFmode, V4SFmode).  For the vector types, all
@@ -3072,6 +3105,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 		  reg_addr[XOmode].reload_store = CODE_FOR_reload_xo_di_store;
 		  reg_addr[XOmode].reload_load = CODE_FOR_reload_xo_di_load;
 		}
+
+	      if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+		{
+		  reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_di_store;
+		  reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_di_load;
+		  reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_di_store;
+		  reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_di_load;
+		  reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_di_store;
+		  reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_di_load;
+		  reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_di_store;
+		  reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_di_load;
+		  reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_di_store;
+		  reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_di_load;
+		  reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_di_store;
+		  reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_di_load;
+		}
 	    }
 	}
       else
@@ -3129,6 +3178,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 	      reg_addr[DDmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdd;
 	      reg_addr[DFmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdf;
 	    }
+
+	  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+	    {
+	      reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_si_store;
+	      reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_si_load;
+	      reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_si_store;
+	      reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_si_load;
+	      reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_si_store;
+	      reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_si_load;
+	      reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_si_store;
+	      reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_si_load;
+	      reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_si_store;
+	      reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_si_load;
+	      reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_si_store;
+	      reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_si_load;
+	    }
 	}
 
       reg_addr[DFmode].scalar_in_vmx_p = true;
@@ -4429,6 +4494,15 @@ rs6000_option_override_internal (bool global_init_p)
       rs6000_isa_flags &= OPTION_MASK_STORE_VECTOR_PAIR;
     }
 
+  if (!TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (OPTION_SET_P (TARGET_VECTOR_SIZE_32))
+	warning (0, "%qs should not be used unless you use %qs",
+		 "-mvector-size-32", "-mmma");
+
+      TARGET_VECTOR_SIZE_32 = 0;
+    }
+
   /* Enable power10 fusion if we are tuning for power10, even if we aren't
      generating power10 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION))
@@ -5174,6 +5248,15 @@ rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 static machine_mode
 rs6000_preferred_simd_mode (scalar_mode mode)
 {
+  /* Prefer vector pair for floating point, but not for integer modes.  */
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (mode == DFmode)
+	return V4DFmode;
+      else if (mode == SFmode)
+	return V8SFmode;
+    }
+
   opt_machine_mode vmode = mode_for_vector (mode, 16 / GET_MODE_SIZE (mode));
 
   if (vmode.exists () && !VECTOR_MEM_NONE_P (vmode.require ()))
@@ -7275,6 +7358,141 @@ rs6000_expand_vector_init (rtx target, rtx vals)
   emit_move_insn (target, mem);
 }
 
+/* For a vector pair mode, return the equivalent vector mode or VOIDmode.  */
+
+static machine_mode
+vector_pair_to_vector_mode (machine_mode mode)
+{
+  machine_mode vmode;
+
+  switch (mode)
+    {
+    case E_V32QImode:  vmode = V16QImode; break;
+    case E_V16HImode:  vmode = V8HImode;  break;
+    case E_V8SImode:   vmode = V4SImode;  break;
+    case E_V4DImode:   vmode = V2DImode;  break;
+    case E_V8SFmode:   vmode = V4SFmode;  break;
+    case E_V4DFmode:   vmode = V2DFmode;  break;
+    default:           vmode = VOIDmode;  break;
+    }
+
+  return vmode;
+}
+
+/* Split a vector constant for a type that can be held into a vector register
+   pair into 2 separate constants that can be held in a single vector register.
+   Return true if we can split the constant.  */
+
+static bool
+rs6000_split_vpair_constant (rtx op, rtx *high, rtx *low)
+{
+  machine_mode vmode = vector_pair_to_vector_mode (GET_MODE (op));
+
+  *high = *low = NULL_RTX;
+
+  if (!CONST_VECTOR_P (op) || vmode == GET_MODE (op))
+    return false;
+
+  size_t nunits = GET_MODE_NUNITS (vmode);
+  rtvec hi_vec = rtvec_alloc (nunits);
+  rtvec lo_vec = rtvec_alloc (nunits);
+
+  for (size_t i = 0; i < nunits; i++)
+    {
+      RTVEC_ELT (hi_vec, i) = CONST_VECTOR_ELT (op, i);
+      RTVEC_ELT (lo_vec, i) = CONST_VECTOR_ELT (op, i + nunits);
+    }
+
+  *high = gen_rtx_CONST_VECTOR (vmode, hi_vec);
+  *low = gen_rtx_CONST_VECTOR (vmode, lo_vec);
+  return true;
+}
+
+/* Initialize vector pair TARGET to VALS.  */
+
+void
+rs6000_expand_vector_pair_init (rtx target, rtx vals)
+{
+  machine_mode mode_vpair = GET_MODE (target);
+  machine_mode mode_vector;
+  size_t n_elts_vpair = GET_MODE_NUNITS (mode_vpair);
+  bool all_same = true;
+  rtx first = XVECEXP (vals, 0, 0);
+  rtx (*gen_splat) (rtx, rtx);
+  rtx (*gen_concat) (rtx, rtx, rtx);
+
+  switch (mode_vpair)
+    {
+    case E_V32QImode:
+      mode_vector = V16QImode;
+      gen_splat = gen_vpair_splat_v32qi;
+      gen_concat = gen_vpair_concat_v32qi;
+      break;
+
+    case E_V16HImode:
+      mode_vector = V8HImode;
+      gen_splat = gen_vpair_splat_v16hi;
+      gen_concat = gen_vpair_concat_v16hi;
+      break;
+
+    case E_V8SImode:
+      mode_vector = V4SImode;
+      gen_splat = gen_vpair_splat_v8si;
+      gen_concat = gen_vpair_concat_v8si;
+      break;
+
+    case E_V4DImode:
+      mode_vector = V2DImode;
+      gen_splat = gen_vpair_splat_v4di;
+      gen_concat = gen_vpair_concat_v4di;
+      break;
+
+    case E_V8SFmode:
+      mode_vector = V4SFmode;
+      gen_splat = gen_vpair_splat_v8sf;
+      gen_concat = gen_vpair_concat_v8sf;
+      break;
+
+    case E_V4DFmode:
+      mode_vector = V2DFmode;
+      gen_splat = gen_vpair_splat_v4df;
+      gen_concat = gen_vpair_concat_v4df;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* See if we can do a splat operation.  */
+  for (size_t i = 1; i < n_elts_vpair; ++i)
+    {
+      if (!rtx_equal_p (XVECEXP (vals, 0, i), first))
+	{
+	  all_same = false;
+	  break;
+	}
+    }
+
+  if (all_same)
+    {
+      emit_insn (gen_splat (target, first));
+      return;
+    }
+
+  /* Break the initialization into two parts.  */
+  rtx vector_hi = gen_reg_rtx (mode_vector);
+  rtx vector_lo = gen_reg_rtx (mode_vector);
+  rtx vals_hi;
+  rtx vals_lo;
+
+  rs6000_split_vpair_constant (vals, &vals_hi, &vals_lo);
+
+  rs6000_expand_vector_init (vector_hi, vals_hi);
+  rs6000_expand_vector_init (vector_lo, vals_lo);
+  emit_insn (gen_concat (target, vector_hi, vector_lo));
+  return;
+}
+
 /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX
    is variable and also counts by vector element size for p9 and above.  */
 
@@ -7603,6 +7821,15 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Set field ELT_RTX of vaector pair TARGET to VAL.  */
+
+void
+rs6000_expand_vector_pair_set (rtx target, rtx val, rtx elt_rtx)
+{
+  if (target || val || elt_rtx)
+    gcc_unreachable ();
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
@@ -8694,6 +8921,8 @@ reg_offset_addressing_ok_p (machine_mode mode)
       /* The vector pair/quad types support offset addressing if the
 	 underlying vectors support offset addressing.  */
     case E_OOmode:
+    case E_V8SFmode:
+    case E_V4DFmode:
     case E_XOmode:
       return TARGET_MMA;
 
@@ -11202,6 +11431,12 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
     case E_V2DFmode:
     case E_V2DImode:
     case E_V1TImode:
+    case E_V32QImode:
+    case E_V16HImode:
+    case E_V8SFmode:
+    case E_V8SImode:
+    case E_V4DFmode:
+    case E_V4DImode:
       if (CONSTANT_P (operands[1])
 	  && !easy_vector_constant (operands[1], mode))
 	operands[1] = force_const_mem (mode, operands[1]);
@@ -13456,7 +13691,7 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
      the GPR registers.  */
   if (rclass == GEN_OR_FLOAT_REGS)
     {
-      if (mode == OOmode)
+      if (VECTOR_PAIR_MODE (mode))
 	return VSX_REGS;
 
       if (mode == XOmode)
@@ -23417,6 +23652,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   rtx tmp = target;
   rtx norreg = gen_reg_rtx (V16QImode);
   machine_mode mode = GET_MODE (target);
+  machine_mode qi_vmode = VECTOR_PAIR_MODE (mode) ? V32QImode : V16QImode;
 
   /* Get everything in regs so the pattern matches.  */
   if (!REG_P (op0))
@@ -23424,7 +23660,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   if (!REG_P (op1))
     op1 = force_reg (mode, op1);
   if (!REG_P (sel))
-    sel = force_reg (V16QImode, sel);
+    sel = force_reg (qi_vmode, sel);
   if (!REG_P (target))
     tmp = gen_reg_rtx (mode);
 
@@ -23437,10 +23673,10 @@ altivec_expand_vec_perm_le (rtx operands[4])
     {
       /* Invert the selector with a VNAND if available, else a VNOR.
 	 The VNAND is preferred for future fusion opportunities.  */
-      notx = gen_rtx_NOT (V16QImode, sel);
+      notx = gen_rtx_NOT (qi_vmode, sel);
       iorx = (TARGET_P8_VECTOR
-	      ? gen_rtx_IOR (V16QImode, notx, notx)
-	      : gen_rtx_AND (V16QImode, notx, notx));
+	      ? gen_rtx_IOR (qi_vmode, notx, notx)
+	      : gen_rtx_AND (qi_vmode, notx, notx));
       emit_insn (gen_rtx_SET (norreg, iorx));
 
       /* Permute with operands reversed and adjusted selector.  */
@@ -24572,6 +24808,9 @@ static struct rs6000_opt_var const rs6000_opt_vars[] =
   { "speculate-indirect-jumps",
     offsetof (struct gcc_options, x_rs6000_speculate_indirect_jumps),
     offsetof (struct cl_target_option, x_rs6000_speculate_indirect_jumps), },
+  { "vector-size-32",
+    offsetof (struct gcc_options, x_TARGET_VECTOR_SIZE_32),
+    offsetof (struct cl_target_option, x_TARGET_VECTOR_SIZE_32), },
 };
 
 /* Inner function to handle attribute((target("..."))) and #pragma GCC target
@@ -27408,6 +27647,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, src.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
@@ -27426,6 +27739,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
   int reg_mode_size;
   /* The number of registers that will be moved.  */
   int nregs;
+  /* Hi/lo values for splitting vector pair constants.  */
+  rtx vpair_hi, vpair_lo;
 
   reg = REG_P (dst) ? REGNO (dst) : REGNO (src);
   mode = GET_MODE (dst);
@@ -27441,8 +27756,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
     }
   /* If we have a vector pair/quad mode, split it into two/four separate
      vectors.  */
-  else if (mode == OOmode || mode == XOmode)
-    reg_mode = V1TImode;
+  else if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
+    {
+      machine_mode vmode = vector_pair_to_vector_mode (mode);
+      reg_mode = (vmode == VOIDmode) ? V1TImode : vmode;
+    }
   else if (FP_REGNO_P (reg))
     reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode :
 	(TARGET_HARD_FLOAT ? DFmode : SFmode);
@@ -27454,6 +27772,29 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 
   gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
 
+  /* Handle vector pair constants.  */
+  if (CONST_VECTOR_P (src) && VECTOR_PAIR_MODE (mode) && TARGET_MMA
+      && rs6000_split_vpair_constant (src, &vpair_hi, &vpair_lo)
+      && VSX_REGNO_P (reg))
+    {
+      reg_mode = GET_MODE (vpair_hi);
+      rtx reg_hi = gen_rtx_REG (reg_mode, reg);
+      rtx reg_lo = gen_rtx_REG (reg_mode, reg + 1);
+
+      emit_move_insn (reg_hi, vpair_hi);
+
+      /* 0.0 is easy.  For other constants, copy the high register into the low
+	 register if the two sets of constants are equal.  This means we won't
+	 be doing back to back prefixed load immediate instructions.  */
+      if (rtx_equal_p (vpair_hi, vpair_lo)
+	  && !rtx_equal_p (vpair_hi, CONST0_RTX (reg_mode)))
+	emit_move_insn (reg_lo, reg_hi);
+      else
+	emit_move_insn (reg_lo, vpair_lo);
+      
+      return;
+    }
+      
   /* TDmode residing in FP registers is special, since the ISA requires that
      the lower-numbered word of a register pair is always the most significant
      word, even in little-endian mode.  This does not match the usual subreg
@@ -27493,7 +27834,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
      below.  This means the last register gets the first memory
      location.  We also need to be careful of using the right register
      numbers if we are splitting XO to OO.  */
-  if (mode == OOmode || mode == XOmode)
+  if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
     {
       nregs = hard_regno_nregs (reg, mode);
       int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
@@ -27553,7 +27894,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	  gcc_assert (REG_P (dst));
 	  if (GET_MODE (src) == XOmode)
 	    gcc_assert (FP_REGNO_P (REGNO (dst)));
-	  if (GET_MODE (src) == OOmode)
+	  if (VECTOR_PAIR_MODE (GET_MODE (src)))
 	    gcc_assert (VSX_REGNO_P (REGNO (dst)));
 
 	  int nvecs = XVECLEN (src, 0);
@@ -27628,7 +27969,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	 overlap.  */
       int i;
       /* XO/OO are opaque so cannot use subregs. */
-      if (mode == OOmode || mode == XOmode )
+      if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	{
 	  for (i = nregs - 1; i >= 0; i--)
 	    {
@@ -27802,7 +28143,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	    continue;
 
 	  /* XO/OO are opaque so cannot use subregs. */
-	  if (mode == OOmode || mode == XOmode )
+	  if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	    {
 	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
 	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 326c45221e9..32848f7d15b 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -1006,6 +1006,12 @@ enum data_align { align_abi, align_opt, align_both };
   (ALTIVEC_VECTOR_MODE (MODE) || VSX_VECTOR_MODE (MODE)			\
    || (MODE) == V2DImode || (MODE) == V1TImode)
 
+/* Whether a mode is held in paired vector registers.  */
+#define VECTOR_PAIR_MODE(MODE)						\
+  ((MODE) == OOmode							\
+   || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode)
+
 /* Post-reload, we can't use any new AltiVec registers, as we already
    emitted the vrsave mask.  */
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..29292c5f5b5 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -683,9 +683,13 @@
 		      (HI    "h")
 		      (SI    "w")
 		      (DI    "d")
+		      (V32QI "b")
 		      (V16QI "b")
+		      (V16HI "h")
 		      (V8HI  "h")
+		      (V8SI  "w")
 		      (V4SI  "w")
+		      (V4DI  "d")
 		      (V2DI  "d")
 		      (V1TI  "q")
 		      (TI    "q")])
@@ -812,7 +816,7 @@
 ;; supplement addressing modes.
 (define_mode_iterator RELOAD [V16QI V8HI V4SI V2DI V4SF V2DF V1TI
 			      SF SD SI DF DD DI TI PTI KF IF TF
-			      OO XO])
+			      OO XO V8SF V4DF V32QI V16HI V8SI V4DI])
 
 ;; Iterate over smin, smax
 (define_code_iterator fp_minmax	[smin smax])
@@ -15767,6 +15771,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 369095df9ed..38e24e20032 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -605,6 +605,10 @@ mstore-vector-pair
 Target Undocumented Mask(STORE_VECTOR_PAIR) Var(rs6000_isa_flags)
 Generate (do not generate) store vector pair instructions.
 
+mvector-size-32
+Target Var(TARGET_VECTOR_SIZE_32) Init(0) Save
+Generate (do not generate) vector pair instructions for float/double vector_size(32).
+
 mrelative-jumptables
 Target Undocumented Var(rs6000_relative_jumptables) Init(1) Save
 
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 26fa32829af..7d1f372d76e 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3566,6 +3566,33 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Exctract DF from V4DF, convert it into extract from V2DF.
+(define_insn_and_split "vsx_extract_v4df"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=wa,r")
+	(vec_select:DF
+	 (match_operand:V4DF 1 "gpc_reg_operand" "wa,wa")
+	 (parallel
+	  [(match_operand:QI 2 "const_0_to_3_operand" "n,n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(vec_select:DF
+	 (match_dup 3)
+	 (parallel [(match_dup 4)])))]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 2)
+      || (!WORDS_BIG_ENDIAN && element < 2))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V2DFmode, reg_num);
+  operands[4] = GEN_INT (element & 1);
+}
+  [(set_attr "type" "mfvsr,vecperm")])
+
 ;; Extract a SF element from V4SF
 (define_insn_and_split "vsx_extract_v4sf"
   [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
@@ -3653,6 +3680,35 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Extract SF from V8SF, converting it into an extract from V4SF
+(define_insn_and_split "vsx_extract_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(vec_select:SF
+	 (match_operand:V8SF 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_0_to_7_operand" "n")])))
+   (clobber (match_scratch:V4SF 3 "=0"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_select:SF
+		    (match_dup 4)
+		    (parallel [(match_dup 5)])))
+	      (clobber (match_dup 3))])]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 4)
+      || (!WORDS_BIG_ENDIAN && element < 4))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V4SFmode, reg_num);
+  operands[4] = GEN_INT (element & 3);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "fp")])
+
 ;; Expand the builtin form of xxpermdi to canonical rtl.
 (define_expand "vsx_xxpermdi_<mode>"
   [(match_operand:VSX_L 0 "vsx_register_operand")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1748afdbfe0..786883df413 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1311,7 +1311,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg}
 -mstack-protector-guard-offset=@var{offset} -mprefixed -mno-prefixed
 -mpcrel -mno-pcrel -mmma -mno-mmma -mrop-protect -mno-rop-protect
--mprivileged -mno-privileged}
+-mprivileged -mno-privileged -mvector-size-32 -mno-vector-size-32}
 
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu
@@ -30960,6 +30960,18 @@ optimization (@option{-fshrink-wrap}).
 @itemx -mno-privileged
 Generate (do not generate) code that will run in privileged state.
 
+@opindex mvector-size-32
+@opindex mno-vector-size-32
+@item -mvector-size-32
+@itemx -mno-vector-size-32
+Generate (do not generate) code that will use the load vector pair and
+store vector pair instructions for vectorization.  This options
+requires @option{-mmma} to be enabled.
+
+At the present time, @option{-mvector-size-32} is not enabled by
+default.  If you use @option{-mvector-size-32}, if will only enable
+vectorized for @code{float} and @code{double} operations.
+
 @opindex block-ops-unaligned-vsx
 @opindex no-block-ops-unaligned-vsx
 @item -mblock-ops-unaligned-vsx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work146-vsize)] Add vector_size(32) support.
@ 2023-11-18  4:38 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2023-11-18  4:38 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:5a967002e67f054ab4336a9ed39f5f7860f93dea

commit 5a967002e67f054ab4336a9ed39f5f7860f93dea
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Nov 17 23:37:45 2023 -0500

    Add vector_size(32) support.
    
    2023-11-17  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (const_0_to_31_operand): New predicate.
            (mma_assemble_input_operand): Allow any 16-byte vector type, not just
            V16QImode.
            * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
            __VECTOR_SIZE_32__ if -mvector-size-32 used.
            * config/rs6000/rs6000-protos.h (rs6000_expand_vector_pair_init): New
            declaration.
            (rs6000_expand_vector_pair_set): Likewise.
            (split_unary_vector_pair): Likewise.
            (split_binary_vector_pair): Likewisee.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (rs6000_hard_regno_mode_ok_uncached): Add
            support for -mvector-size-32.
            (rs6000_hard_regno_mode_ok): Likewise.
            (rs6000_setup_reg_addr_masks): Likewise.
            (rs6000_init_hard_regno_mode_ok): Likewise.
            (rs6000_option_override_internal): Likewise.
            (rs6000_builtin_vectorization_cost): Likewise.
            (vector_pair_to_vector_mode): New function.
            (+rs6000_split_vpair_constan): Likewise.
            (rs6000_expand_vector_pair_ini): Likewise.
            (rs6000_expand_vector_pair_set): Likewise.
            (reg_offset_addressing_ok_p): Likewise.
            (rs6000_emit_move): Likewise.
            (altivec_expand_vec_perm_le): Likewise.
            (rs6000_opt_var): Add -mvector-size-32.
            (split_unary_vector_pair): New function.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pai): Likewise.
            (rs6000_split_multireg_move): Add -mvector-size-32 support.
            * config/rs6000/rs6000.h (VECTOR_PAIR_MODE): New macro.
            * config/rs6000/rs6000.md (wd mode attribute): Add vector pair modes.
            (RELOAD mode iterator): Likewise.
            (toplevel): Include vector-pair.md.
            * config/rs6000/rs6000.opt (-mvector-size-32): New option.
            * config/rs6000/vector-pair.md: New file.
            * config/rs6000/vsx.md (vsx_extract_v4df): New insn.
            (vsx_extract_v8sf): Likewise.
            * doc/invoke.texi (RS/6000 and PowerPC Options): Document
            -mvector-size-32.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-size-32-1.c: New test.
            * gcc.target/powerpc/vector-size-32-2.c: Likewise.
            * gcc.target/powerpc/vector-size-32-3.c: Likewise.
            * gcc.target/powerpc/vector-size-32-4.c: Likewise.
            * gcc.target/powerpc/vector-size-32-5.c: Likewise.
            * gcc.target/powerpc/vector-size-32-6.c: Likewise.

Diff:
---
 gcc/config/rs6000/predicates.md                    |   9 +-
 gcc/config/rs6000/rs6000-c.cc                      |   3 +
 gcc/config/rs6000/rs6000-protos.h                  |   7 +
 gcc/config/rs6000/rs6000.cc                        | 370 ++++++++++-
 gcc/config/rs6000/rs6000.h                         |   6 +
 gcc/config/rs6000/rs6000.md                        |   7 +-
 gcc/config/rs6000/rs6000.opt                       |   4 +
 gcc/config/rs6000/vector-pair.md                   | 704 +++++++++++++++++++++
 gcc/config/rs6000/vsx.md                           |  56 ++
 gcc/doc/invoke.texi                                |  14 +-
 .../gcc.target/powerpc/vector-size-32-1.c          |  85 +++
 .../gcc.target/powerpc/vector-size-32-2.c          |  96 +++
 .../gcc.target/powerpc/vector-size-32-3.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-4.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-5.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-6.c          | 137 ++++
 16 files changed, 1886 insertions(+), 23 deletions(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c4..8a56487d7d2 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -327,6 +327,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
 
+;; Match op = 0..31
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
 ;; Return 1 if op is a 34-bit constant integer.
 (define_predicate "cint34_operand"
   (match_code "const_int")
@@ -1301,8 +1306,10 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(GET_MODE_SIZE (mode) == 16 && VECTOR_MODE_P (mode)
 		&& (vsx_register_operand (op, mode)
+		    || op == CONST0_RTX (mode)
+		    || vsx_prefixed_constant (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
 			    || quad_address_p (XEXP (op, 0), mode, false)))))"))
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 65be0ac43e2..27114b14022 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -631,6 +631,9 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
     builtin_define ("__SIZEOF_IBM128__=16");
   if (ieee128_float_type_node)
     builtin_define ("__SIZEOF_IEEE128__=16");
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    builtin_define ("__VECTOR_SIZE_32__");
+
 #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB
   builtin_define ("__BUILTIN_CPU_SUPPORTS__");
 #endif
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..13687c5b1b3 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -61,7 +61,9 @@ extern bool rs6000_move_128bit_ok_p (rtx []);
 extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
+extern void rs6000_expand_vector_pair_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, rtx);
+extern void rs6000_expand_vector_pair_set (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
@@ -138,6 +140,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0dd21e67dde..b85e3b75d6a 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -1843,7 +1843,7 @@ rs6000_hard_regno_mode_ok_uncached (int regno, machine_mode mode)
 
   /* Vector pair modes need even/odd VSX register pairs.  Only allow vector
      registers.  */
-  if (mode == OOmode)
+  if (VECTOR_PAIR_MODE (mode))
     return (TARGET_MMA && VSX_REGNO_P (regno) && (regno & 1) == 0);
 
   /* MMA accumulator modes need FPR registers divisible by 4.  */
@@ -1954,9 +1954,10 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
    GPR registers, and TImode can go in any GPR as well as VSX registers (PR
    57744).
 
-   Similarly, don't allow OOmode (vector pair, restricted to even VSX
-   registers) or XOmode (vector quad, restricted to FPR registers divisible
-   by 4) to tie with other modes.
+   Similarly, don't allow XOmode (vector quad, restricted to FPR registers
+   divisible by 4) to tie with other modes.
+
+   Vector pair modes can tie with other vector pair modes.
 
    Altivec/VSX vector tests were moved ahead of scalar float mode, so that IEEE
    128-bit floating point on VSX systems ties with other vectors.  */
@@ -1964,9 +1965,14 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 static bool
 rs6000_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
-  if (mode1 == PTImode || mode1 == OOmode || mode1 == XOmode
-      || mode2 == PTImode || mode2 == OOmode || mode2 == XOmode)
-    return mode1 == mode2;
+  if (mode1 == PTImode || mode1 == XOmode
+       || mode2 == PTImode || mode2 == XOmode)
+     return mode1 == mode2;
+ 
+  if (VECTOR_PAIR_MODE (mode1))
+    return VECTOR_PAIR_MODE (mode2);
+  if (VECTOR_PAIR_MODE (mode2))
+    return ALTIVEC_OR_VSX_VECTOR_MODE (mode1);
 
   if (ALTIVEC_OR_VSX_VECTOR_MODE (mode1))
     return ALTIVEC_OR_VSX_VECTOR_MODE (mode2);
@@ -2715,13 +2721,13 @@ rs6000_setup_reg_addr_masks (void)
 	     of the LXVP or STXVP instructions, do not allow indexed mode so
 	     that we can split the load/store.  */
 	  else if ((addr_mask != 0) && TARGET_MMA
-		   && (m2 == OOmode || m2 == XOmode))
+		   && (VECTOR_PAIR_MODE (m2) || m2 == XOmode))
 	    {
 	      addr_mask |= RELOAD_REG_OFFSET;
 	      if (rc == RELOAD_REG_FPR || rc == RELOAD_REG_VMX)
 		{
 		  addr_mask |= RELOAD_REG_QUAD_OFFSET;
-		  if (m2 == OOmode
+		  if (VECTOR_PAIR_MODE (m2)
 		      && TARGET_LOAD_VECTOR_PAIR
 		      && TARGET_STORE_VECTOR_PAIR)
 		    addr_mask |= RELOAD_REG_INDEXED;
@@ -2941,6 +2947,33 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
       rs6000_vector_align[XOmode] = 512;
     }
 
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      rs6000_vector_unit[V32QImode] = VECTOR_NONE;
+      rs6000_vector_mem[V32QImode] = VECTOR_VSX;
+      rs6000_vector_align[V32QImode] = 256;
+
+      rs6000_vector_unit[V16HImode] = VECTOR_NONE;
+      rs6000_vector_mem[V16HImode] = VECTOR_VSX;
+      rs6000_vector_align[V16HImode] = 256;
+
+      rs6000_vector_unit[V8SImode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SImode] = VECTOR_VSX;
+      rs6000_vector_align[V8SImode] = 256;
+
+      rs6000_vector_unit[V8SFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SFmode] = VECTOR_VSX;
+      rs6000_vector_align[V8SFmode] = 256;
+
+      rs6000_vector_unit[V4DImode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DImode] = VECTOR_VSX;
+      rs6000_vector_align[V4DImode] = 256;
+
+      rs6000_vector_unit[V4DFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DFmode] = VECTOR_VSX;
+      rs6000_vector_align[V4DFmode] = 256;
+    }
+
   /* Register class constraints for the constraints that depend on compile
      switches. When the VSX code was added, different constraints were added
      based on the type (DFmode, V2DFmode, V4SFmode).  For the vector types, all
@@ -3072,6 +3105,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 		  reg_addr[XOmode].reload_store = CODE_FOR_reload_xo_di_store;
 		  reg_addr[XOmode].reload_load = CODE_FOR_reload_xo_di_load;
 		}
+
+	      if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+		{
+		  reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_di_store;
+		  reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_di_load;
+		  reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_di_store;
+		  reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_di_load;
+		  reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_di_store;
+		  reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_di_load;
+		  reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_di_store;
+		  reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_di_load;
+		  reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_di_store;
+		  reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_di_load;
+		  reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_di_store;
+		  reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_di_load;
+		}
 	    }
 	}
       else
@@ -3129,6 +3178,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 	      reg_addr[DDmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdd;
 	      reg_addr[DFmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdf;
 	    }
+
+	  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+	    {
+	      reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_si_store;
+	      reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_si_load;
+	      reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_si_store;
+	      reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_si_load;
+	      reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_si_store;
+	      reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_si_load;
+	      reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_si_store;
+	      reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_si_load;
+	      reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_si_store;
+	      reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_si_load;
+	      reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_si_store;
+	      reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_si_load;
+	    }
 	}
 
       reg_addr[DFmode].scalar_in_vmx_p = true;
@@ -4429,6 +4494,15 @@ rs6000_option_override_internal (bool global_init_p)
       rs6000_isa_flags &= OPTION_MASK_STORE_VECTOR_PAIR;
     }
 
+  if (!TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (OPTION_SET_P (TARGET_VECTOR_SIZE_32))
+	warning (0, "%qs should not be used unless you use %qs",
+		 "-mvector-size-32", "-mmma");
+
+      TARGET_VECTOR_SIZE_32 = 0;
+    }
+
   /* Enable power10 fusion if we are tuning for power10, even if we aren't
      generating power10 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION))
@@ -7275,6 +7349,141 @@ rs6000_expand_vector_init (rtx target, rtx vals)
   emit_move_insn (target, mem);
 }
 
+/* For a vector pair mode, return the equivalent vector mode or VOIDmode.  */
+
+static machine_mode
+vector_pair_to_vector_mode (machine_mode mode)
+{
+  machine_mode vmode;
+
+  switch (mode)
+    {
+    case E_V32QImode:  vmode = V16QImode; break;
+    case E_V16HImode:  vmode = V8HImode;  break;
+    case E_V8SImode:   vmode = V4SImode;  break;
+    case E_V4DImode:   vmode = V2DImode;  break;
+    case E_V8SFmode:   vmode = V4SFmode;  break;
+    case E_V4DFmode:   vmode = V2DFmode;  break;
+    default:           vmode = VOIDmode;  break;
+    }
+
+  return vmode;
+}
+
+/* Split a vector constant for a type that can be held into a vector register
+   pair into 2 separate constants that can be held in a single vector register.
+   Return true if we can split the constant.  */
+
+static bool
+rs6000_split_vpair_constant (rtx op, rtx *high, rtx *low)
+{
+  machine_mode vmode = vector_pair_to_vector_mode (GET_MODE (op));
+
+  *high = *low = NULL_RTX;
+
+  if (!CONST_VECTOR_P (op) || vmode == GET_MODE (op))
+    return false;
+
+  size_t nunits = GET_MODE_NUNITS (vmode);
+  rtvec hi_vec = rtvec_alloc (nunits);
+  rtvec lo_vec = rtvec_alloc (nunits);
+
+  for (size_t i = 0; i < nunits; i++)
+    {
+      RTVEC_ELT (hi_vec, i) = CONST_VECTOR_ELT (op, i);
+      RTVEC_ELT (lo_vec, i) = CONST_VECTOR_ELT (op, i + nunits);
+    }
+
+  *high = gen_rtx_CONST_VECTOR (vmode, hi_vec);
+  *low = gen_rtx_CONST_VECTOR (vmode, lo_vec);
+  return true;
+}
+
+/* Initialize vector pair TARGET to VALS.  */
+
+void
+rs6000_expand_vector_pair_init (rtx target, rtx vals)
+{
+  machine_mode mode_vpair = GET_MODE (target);
+  machine_mode mode_vector;
+  size_t n_elts_vpair = GET_MODE_NUNITS (mode_vpair);
+  bool all_same = true;
+  rtx first = XVECEXP (vals, 0, 0);
+  rtx (*gen_splat) (rtx, rtx);
+  rtx (*gen_concat) (rtx, rtx, rtx);
+
+  switch (mode_vpair)
+    {
+    case E_V32QImode:
+      mode_vector = V16QImode;
+      gen_splat = gen_vpair_splat_v32qi;
+      gen_concat = gen_vpair_concat_v32qi;
+      break;
+
+    case E_V16HImode:
+      mode_vector = V8HImode;
+      gen_splat = gen_vpair_splat_v16hi;
+      gen_concat = gen_vpair_concat_v16hi;
+      break;
+
+    case E_V8SImode:
+      mode_vector = V4SImode;
+      gen_splat = gen_vpair_splat_v8si;
+      gen_concat = gen_vpair_concat_v8si;
+      break;
+
+    case E_V4DImode:
+      mode_vector = V2DImode;
+      gen_splat = gen_vpair_splat_v4di;
+      gen_concat = gen_vpair_concat_v4di;
+      break;
+
+    case E_V8SFmode:
+      mode_vector = V4SFmode;
+      gen_splat = gen_vpair_splat_v8sf;
+      gen_concat = gen_vpair_concat_v8sf;
+      break;
+
+    case E_V4DFmode:
+      mode_vector = V2DFmode;
+      gen_splat = gen_vpair_splat_v4df;
+      gen_concat = gen_vpair_concat_v4df;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* See if we can do a splat operation.  */
+  for (size_t i = 1; i < n_elts_vpair; ++i)
+    {
+      if (!rtx_equal_p (XVECEXP (vals, 0, i), first))
+	{
+	  all_same = false;
+	  break;
+	}
+    }
+
+  if (all_same)
+    {
+      emit_insn (gen_splat (target, first));
+      return;
+    }
+
+  /* Break the initialization into two parts.  */
+  rtx vector_hi = gen_reg_rtx (mode_vector);
+  rtx vector_lo = gen_reg_rtx (mode_vector);
+  rtx vals_hi;
+  rtx vals_lo;
+
+  rs6000_split_vpair_constant (vals, &vals_hi, &vals_lo);
+
+  rs6000_expand_vector_init (vector_hi, vals_hi);
+  rs6000_expand_vector_init (vector_lo, vals_lo);
+  emit_insn (gen_concat (target, vector_hi, vector_lo));
+  return;
+}
+
 /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX
    is variable and also counts by vector element size for p9 and above.  */
 
@@ -7603,6 +7812,15 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Set field ELT_RTX of vaector pair TARGET to VAL.  */
+
+void
+rs6000_expand_vector_pair_set (rtx target, rtx val, rtx elt_rtx)
+{
+  if (target || val || elt_rtx)
+    gcc_unreachable ();
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
@@ -8694,6 +8912,8 @@ reg_offset_addressing_ok_p (machine_mode mode)
       /* The vector pair/quad types support offset addressing if the
 	 underlying vectors support offset addressing.  */
     case E_OOmode:
+    case E_V8SFmode:
+    case E_V4DFmode:
     case E_XOmode:
       return TARGET_MMA;
 
@@ -11195,10 +11415,14 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
 	operands[1] = force_const_mem (mode, operands[1]);
       break;
 
+    case E_V32QImode:
     case E_V16QImode:
+    case E_V16HImode:
     case E_V8HImode:
+    case E_V8SFmode:
     case E_V4SFmode:
     case E_V4SImode:
+    case E_V4DFmode:
     case E_V2DFmode:
     case E_V2DImode:
     case E_V1TImode:
@@ -13456,7 +13680,7 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
      the GPR registers.  */
   if (rclass == GEN_OR_FLOAT_REGS)
     {
-      if (mode == OOmode)
+      if (VECTOR_PAIR_MODE (mode))
 	return VSX_REGS;
 
       if (mode == XOmode)
@@ -23417,6 +23641,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   rtx tmp = target;
   rtx norreg = gen_reg_rtx (V16QImode);
   machine_mode mode = GET_MODE (target);
+  machine_mode qi_vmode = VECTOR_PAIR_MODE (mode) ? V32QImode : V16QImode;
 
   /* Get everything in regs so the pattern matches.  */
   if (!REG_P (op0))
@@ -23424,7 +23649,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   if (!REG_P (op1))
     op1 = force_reg (mode, op1);
   if (!REG_P (sel))
-    sel = force_reg (V16QImode, sel);
+    sel = force_reg (qi_vmode, sel);
   if (!REG_P (target))
     tmp = gen_reg_rtx (mode);
 
@@ -23437,10 +23662,10 @@ altivec_expand_vec_perm_le (rtx operands[4])
     {
       /* Invert the selector with a VNAND if available, else a VNOR.
 	 The VNAND is preferred for future fusion opportunities.  */
-      notx = gen_rtx_NOT (V16QImode, sel);
+      notx = gen_rtx_NOT (qi_vmode, sel);
       iorx = (TARGET_P8_VECTOR
-	      ? gen_rtx_IOR (V16QImode, notx, notx)
-	      : gen_rtx_AND (V16QImode, notx, notx));
+	      ? gen_rtx_IOR (qi_vmode, notx, notx)
+	      : gen_rtx_AND (qi_vmode, notx, notx));
       emit_insn (gen_rtx_SET (norreg, iorx));
 
       /* Permute with operands reversed and adjusted selector.  */
@@ -24572,6 +24797,9 @@ static struct rs6000_opt_var const rs6000_opt_vars[] =
   { "speculate-indirect-jumps",
     offsetof (struct gcc_options, x_rs6000_speculate_indirect_jumps),
     offsetof (struct cl_target_option, x_rs6000_speculate_indirect_jumps), },
+  { "vector-size-32",
+    offsetof (struct gcc_options, x_TARGET_VECTOR_SIZE_32),
+    offsetof (struct cl_target_option, x_TARGET_VECTOR_SIZE_32), },
 };
 
 /* Inner function to handle attribute((target("..."))) and #pragma GCC target
@@ -27408,6 +27636,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, src.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
@@ -27426,6 +27728,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
   int reg_mode_size;
   /* The number of registers that will be moved.  */
   int nregs;
+  /* Hi/lo values for splitting vector pair constants.  */
+  rtx vpair_hi, vpair_lo;
 
   reg = REG_P (dst) ? REGNO (dst) : REGNO (src);
   mode = GET_MODE (dst);
@@ -27441,8 +27745,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
     }
   /* If we have a vector pair/quad mode, split it into two/four separate
      vectors.  */
-  else if (mode == OOmode || mode == XOmode)
-    reg_mode = V1TImode;
+  else if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
+    {
+      machine_mode vmode = vector_pair_to_vector_mode (mode);
+      reg_mode = (vmode == VOIDmode) ? V1TImode : vmode;
+    }
   else if (FP_REGNO_P (reg))
     reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode :
 	(TARGET_HARD_FLOAT ? DFmode : SFmode);
@@ -27454,6 +27761,29 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 
   gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
 
+  /* Handle vector pair constants.  */
+  if (CONST_VECTOR_P (src) && VECTOR_PAIR_MODE (mode) && TARGET_MMA
+      && rs6000_split_vpair_constant (src, &vpair_hi, &vpair_lo)
+      && VSX_REGNO_P (reg))
+    {
+      reg_mode = GET_MODE (vpair_hi);
+      rtx reg_hi = gen_rtx_REG (reg_mode, reg);
+      rtx reg_lo = gen_rtx_REG (reg_mode, reg + 1);
+
+      emit_move_insn (reg_hi, vpair_hi);
+
+      /* 0.0 is easy.  For other constants, copy the high register into the low
+	 register if the two sets of constants are equal.  This means we won't
+	 be doing back to back prefixed load immediate instructions.  */
+      if (rtx_equal_p (vpair_hi, vpair_lo)
+	  && !rtx_equal_p (vpair_hi, CONST0_RTX (reg_mode)))
+	emit_move_insn (reg_lo, reg_hi);
+      else
+	emit_move_insn (reg_lo, vpair_lo);
+      
+      return;
+    }
+      
   /* TDmode residing in FP registers is special, since the ISA requires that
      the lower-numbered word of a register pair is always the most significant
      word, even in little-endian mode.  This does not match the usual subreg
@@ -27493,7 +27823,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
      below.  This means the last register gets the first memory
      location.  We also need to be careful of using the right register
      numbers if we are splitting XO to OO.  */
-  if (mode == OOmode || mode == XOmode)
+  if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
     {
       nregs = hard_regno_nregs (reg, mode);
       int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
@@ -27553,7 +27883,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	  gcc_assert (REG_P (dst));
 	  if (GET_MODE (src) == XOmode)
 	    gcc_assert (FP_REGNO_P (REGNO (dst)));
-	  if (GET_MODE (src) == OOmode)
+	  if (VECTOR_PAIR_MODE (GET_MODE (src)))
 	    gcc_assert (VSX_REGNO_P (REGNO (dst)));
 
 	  int nvecs = XVECLEN (src, 0);
@@ -27628,7 +27958,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	 overlap.  */
       int i;
       /* XO/OO are opaque so cannot use subregs. */
-      if (mode == OOmode || mode == XOmode )
+      if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	{
 	  for (i = nregs - 1; i >= 0; i--)
 	    {
@@ -27802,7 +28132,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	    continue;
 
 	  /* XO/OO are opaque so cannot use subregs. */
-	  if (mode == OOmode || mode == XOmode )
+	  if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	    {
 	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
 	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 326c45221e9..32848f7d15b 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -1006,6 +1006,12 @@ enum data_align { align_abi, align_opt, align_both };
   (ALTIVEC_VECTOR_MODE (MODE) || VSX_VECTOR_MODE (MODE)			\
    || (MODE) == V2DImode || (MODE) == V1TImode)
 
+/* Whether a mode is held in paired vector registers.  */
+#define VECTOR_PAIR_MODE(MODE)						\
+  ((MODE) == OOmode							\
+   || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode)
+
 /* Post-reload, we can't use any new AltiVec registers, as we already
    emitted the vrsave mask.  */
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..29292c5f5b5 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -683,9 +683,13 @@
 		      (HI    "h")
 		      (SI    "w")
 		      (DI    "d")
+		      (V32QI "b")
 		      (V16QI "b")
+		      (V16HI "h")
 		      (V8HI  "h")
+		      (V8SI  "w")
 		      (V4SI  "w")
+		      (V4DI  "d")
 		      (V2DI  "d")
 		      (V1TI  "q")
 		      (TI    "q")])
@@ -812,7 +816,7 @@
 ;; supplement addressing modes.
 (define_mode_iterator RELOAD [V16QI V8HI V4SI V2DI V4SF V2DF V1TI
 			      SF SD SI DF DD DI TI PTI KF IF TF
-			      OO XO])
+			      OO XO V8SF V4DF V32QI V16HI V8SI V4DI])
 
 ;; Iterate over smin, smax
 (define_code_iterator fp_minmax	[smin smax])
@@ -15767,6 +15771,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 369095df9ed..38e24e20032 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -605,6 +605,10 @@ mstore-vector-pair
 Target Undocumented Mask(STORE_VECTOR_PAIR) Var(rs6000_isa_flags)
 Generate (do not generate) store vector pair instructions.
 
+mvector-size-32
+Target Var(TARGET_VECTOR_SIZE_32) Init(0) Save
+Generate (do not generate) vector pair instructions for float/double vector_size(32).
+
 mrelative-jumptables
 Target Undocumented Var(rs6000_relative_jumptables) Init(1) Save
 
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..51b93a43ab0
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,704 @@
+;; Vector pair arithmetic and logical instruction support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; This function adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+;; Iterator for all vector pair modes.  Even though we do not provide integer
+;; vector pair operations at this time, we need to support loading and storing
+;; integer vector pairs for perumte operations (and eventually compare).
+(define_mode_iterator VPAIR [V32QI V16HI V8SI V4DI V8SF V4DF])
+
+;; Floating point vector pair ops
+(define_mode_iterator VPAIR_FP [V8SF V4DF])
+
+;; Iterator for floating point unary/binary operations.
+(define_code_iterator VPAIR_FP_UNARY  [abs neg])
+(define_code_iterator VPAIR_FP_BINARY [plus minus mult smin smax])
+
+;; Integer vector pair ops.  We need the basic logical opts to support
+;; permution on little endian systems.
+(define_mode_iterator VPAIR_INT [V32QI V16HI V8SI V4DI])
+
+;; Special iterators for NEG (V4SI and V2DI have vneg{w,d}), while V16QI and
+;; V8HI have to use a subtract from 0.
+(define_mode_iterator VPAIR_NEG_VNEG [V4DI V8SI])
+(define_mode_iterator VPAIR_NEG_SUB [V32QI V16HI])
+
+;; Iterator integer unary/binary operations.  Logical operations can be done on
+;; all VSX registers, while the binary int operators need Altivec registers.
+(define_code_iterator VPAIR_LOGICAL_UNARY  [not])
+(define_code_iterator VPAIR_LOGICAL_BINARY [and ior xor])
+
+(define_code_iterator VPAIR_INT_BINARY     [plus minus smin smax])
+
+;; Give the insn name from the opertion
+(define_code_attr vpair_op [(abs   "abs")
+			    (div   "div")
+			    (and   "and")
+			    (fma   "fma")
+			    (ior   "ior")
+			    (minus "sub")
+			    (mult  "mul")
+			    (neg   "neg")
+			    (not   "one_cmpl")
+			    (plus  "add")
+			    (smin  "smin")
+			    (smax  "smax")
+			    (xor   "xor")])
+
+;; Map vector pair mode to vector mode in upper case after the vector pair is
+;; split to two vectors.
+(define_mode_attr VPAIR_VECTOR [(V32QI "V16QI")
+				(V16HI "V8HI")
+				(V8SI  "V4SI")
+				(V4DI  "V2DI")
+				(V8SF  "V4SF")
+                                (V4DF  "V2DF")])
+
+;; Map vector pair mode to vector mode in lower case after the vector pair is
+;; split to two vectors.
+(define_mode_attr vpair_vector_l [(V32QI "v16qi")
+				  (V16HI "v8hi")
+				  (V8SI  "v4si")
+				  (V4DI  "v2di")
+				  (V8SF  "v4sf")
+				  (V4DF  "v2df")])
+
+;; Map vector pair mode to the base element mode.
+(define_mode_attr VPAIR_ELEMENT [(V32QI "QI")
+				 (V16HI "HI")
+				 (V8SI  "SI")
+				 (V4DI  "DI")
+				 (V8SF  "SF")
+				 (V4DF  "DF")])
+
+;; Map vector pair mode to the base element mode in lower case.
+(define_mode_attr vpair_element_l [(V32QI "qi")
+				   (V16HI "hi")
+				   (V8SI  "si")
+				   (V4DI  "di")
+				   (V8SF  "sf")
+				   (V4DF  "df")])
+
+;; Vector pair move support.
+(define_expand "mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand")
+	(match_operand:VPAIR 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_emit_move (operands[0], operands[1], <MODE>mode);
+  DONE;
+})
+
+(define_insn_and_split "*mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand" "=wa,wa,ZwO,QwO,wa,wa")
+	(match_operand:VPAIR 1 "input_operand" "ZwO,QwO,wa,wa,wa,j"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && (gpc_reg_operand (operands[0], <MODE>mode)
+       || gpc_reg_operand (operands[1], <MODE>mode))"
+  "@
+   lxvp%X1 %x0,%1
+   #
+   stxvp%X0 %x1,%0
+   #
+   #
+   #"
+  "&& reload_completed
+   && ((MEM_P (operands[0]) && !TARGET_STORE_VECTOR_PAIR)
+       || (MEM_P (operands[1]) && !TARGET_LOAD_VECTOR_PAIR)
+       || (!MEM_P (operands[0]) && !MEM_P (operands[1])))"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (<MODE>mode))
+    {
+      machine_mode vmode = <VPAIR_VECTOR>mode;
+      rtx op0_reg0 = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+      rtx op0_reg1 = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+      rtx zero = CONST0_RTX (vmode);
+      emit_move_insn (op0_reg0, zero);
+      emit_move_insn (op0_reg1, zero);
+      DONE;
+    }
+
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecload,vecstore,vecstore,veclogical,vecperm")
+   (set_attr "size" "256")
+   (set_attr "length" "*,8,*,8,8,8")
+   (set_attr "isa" "lxvp,*,stxvp,*,*,*")])
+\f
+;; Vector pair initialization
+(define_expand "vec_init<mode><vpair_element_l>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:VPAIR 1 "")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_init (operands[0], operands[1]);
+  DONE;
+})
+
+;; Vector pair set element
+(define_expand "vec_set<mode>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:<VPAIR_ELEMENT> 1 "register_operand")
+   (match_operand 2 "vec_set_index_operand")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_set (operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+;; Vector pair extraction
+(define_insn_and_split "vec_extract<mode><vpair_element_l>"
+  [(set (match_operand:<VPAIR_ELEMENT> 0 "vsx_register_operand" "=wa")
+	(vec_select:<VPAIR_ELEMENT>
+	 (match_operand:VPAIR 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_int_operand" "n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  HOST_WIDE_INT elt = INTVAL (operands[2]);
+  machine_mode mode = <MODE>mode;
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  unsigned vsize = GET_MODE_SIZE (<VPAIR_VECTOR>mode);
+  unsigned reg_num = ((WORDS_BIG_ENDIAN && elt >= vsize)
+		      || (!WORDS_BIG_ENDIAN && elt < vsize));
+	   
+  rtx vreg = simplify_gen_subreg (vmode, op1, mode, reg_num * 16);
+  emit_insn (gen_vsx_extract_<vpair_vector_l> (op0, vreg,
+					       GEN_INT (elt % vsize)));
+  DONE;
+})
+
+;; Assemble a vector pair from two vectors.
+;;
+;; We have both endian versions to change which input register will be moved
+;; the the first register in the vector pair.
+(define_expand "vpair_concat_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32")
+
+(define_insn_and_split "*vpair_concat_<mode>_be"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=wa,&wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "0,mwajeP")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*vpair_concat_<mode>_le"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=&wa,wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "mwajeP,0")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && !WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+;; Zero a vector pair
+(define_expand "vpair_zero_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand") (match_dup 1))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  operands[1] = CONST0_RTX (<MODE>mode);
+})
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<mode>"
+  [(use (match_operand:VPAIR 0 "vsx_register_operand"))
+   (use (match_operand:<VPAIR_ELEMENT> 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (vmode))
+    {
+      emit_insn (gen_vpair_zero_<mode> (op0));
+      DONE;
+    }
+
+  rtx tmp = gen_reg_rtx (vmode);
+
+  unsigned num_elements = GET_MODE_NUNITS (vmode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (size_t i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rtx vec_elements = gen_rtx_PARALLEL (vmode, elements);
+  rs6000_expand_vector_init (tmp, vec_elements);
+  emit_insn (gen_vpair_concat_<mode> (op0, tmp, tmp));
+  DONE;
+})
+\f
+;; Vector pair floating point arithmetic unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_UNARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize negative absolute value (both floating point and integer)
+(define_insn_and_split "nabs<mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(neg:VPAIR_FP
+	 (abs:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_vsx_nabs<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point arithmetic binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_BINARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-add
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-subtract
+(define_insn_and_split "fms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (neg:VPAIR_FP
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point negative fused multiply-add
+(define_insn_and_split "nfma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused negative multiply-subtract
+(define_insn_and_split "nfms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (neg:VPAIR_FP
+	   (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into fma (a, b, c)
+(define_insn_and_split "*fma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(plus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (match_dup 3)))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into fma (a, b, -c)
+(define_insn_and_split "*fms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(minus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (neg:VPAIR_FP
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) + c) into -fma (a, b, c)
+(define_insn_and_split "*nfma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (plus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into -fma (a, b, -c)
+(define_insn_and_split "*nfms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (minus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (neg:VPAIR_FP
+			(match_dup 3)))))]
+{
+}
+  [(set_attr "length" "8")])
+\f
+;; Vector pair negate if we have the VNEGx instruction.
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_VNEG 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_VNEG
+	 (match_operand:VPAIR_NEG_VNEG 1 "vsx_register_operand" "v")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_neg<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair negate if we have to do a subtract from 0
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_SUB 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_SUB
+	 (match_operand:VPAIR_NEG_SUB 1 "vsx_register_operand" "v")))
+   (clobber (match_scratch:<VPAIR_VECTOR> 2 "=&v"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = <VPAIR_VECTOR>mode;
+  rtx tmp = operands[2];
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  emit_move_insn (tmp, CONST0_RTX (mode));
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0),
+				       tmp,
+				       gen_rtx_REG (mode, reg1)));
+
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0 + 1),
+				       tmp,
+				       gen_rtx_REG (mode, reg1 + 1)));
+
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_UNARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_BINARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optiomize vector pair ~(a | b)  or ((~a) & (~b)) to produce xxlnor
+(define_insn_and_split "*nor<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (ior:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nor<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) & b to use xxlandc
+(define_insn_and_split "*andc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_andc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair ~(a ^ b) to produce xxleqv
+(define_insn_and_split "*eqv<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (xor:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+[(set_attr "length" "8")])
+
+
+;; Optiomize vector pair ~(a & b) or ((~a) | (~b)) to produce xxlnand
+(define_insn_and_split "*nand<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (and:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nand<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) | b to produce xxlorc
+(define_insn_and_split "*orc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_orc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 26fa32829af..7d1f372d76e 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3566,6 +3566,33 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Exctract DF from V4DF, convert it into extract from V2DF.
+(define_insn_and_split "vsx_extract_v4df"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=wa,r")
+	(vec_select:DF
+	 (match_operand:V4DF 1 "gpc_reg_operand" "wa,wa")
+	 (parallel
+	  [(match_operand:QI 2 "const_0_to_3_operand" "n,n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(vec_select:DF
+	 (match_dup 3)
+	 (parallel [(match_dup 4)])))]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 2)
+      || (!WORDS_BIG_ENDIAN && element < 2))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V2DFmode, reg_num);
+  operands[4] = GEN_INT (element & 1);
+}
+  [(set_attr "type" "mfvsr,vecperm")])
+
 ;; Extract a SF element from V4SF
 (define_insn_and_split "vsx_extract_v4sf"
   [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
@@ -3653,6 +3680,35 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Extract SF from V8SF, converting it into an extract from V4SF
+(define_insn_and_split "vsx_extract_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(vec_select:SF
+	 (match_operand:V8SF 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_0_to_7_operand" "n")])))
+   (clobber (match_scratch:V4SF 3 "=0"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_select:SF
+		    (match_dup 4)
+		    (parallel [(match_dup 5)])))
+	      (clobber (match_dup 3))])]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 4)
+      || (!WORDS_BIG_ENDIAN && element < 4))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V4SFmode, reg_num);
+  operands[4] = GEN_INT (element & 3);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "fp")])
+
 ;; Expand the builtin form of xxpermdi to canonical rtl.
 (define_expand "vsx_xxpermdi_<mode>"
   [(match_operand:VSX_L 0 "vsx_register_operand")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1748afdbfe0..786883df413 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1311,7 +1311,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg}
 -mstack-protector-guard-offset=@var{offset} -mprefixed -mno-prefixed
 -mpcrel -mno-pcrel -mmma -mno-mmma -mrop-protect -mno-rop-protect
--mprivileged -mno-privileged}
+-mprivileged -mno-privileged -mvector-size-32 -mno-vector-size-32}
 
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu
@@ -30960,6 +30960,18 @@ optimization (@option{-fshrink-wrap}).
 @itemx -mno-privileged
 Generate (do not generate) code that will run in privileged state.
 
+@opindex mvector-size-32
+@opindex mno-vector-size-32
+@item -mvector-size-32
+@itemx -mno-vector-size-32
+Generate (do not generate) code that will use the load vector pair and
+store vector pair instructions for vectorization.  This options
+requires @option{-mmma} to be enabled.
+
+At the present time, @option{-mvector-size-32} is not enabled by
+default.  If you use @option{-mvector-size-32}, if will only enable
+vectorized for @code{float} and @code{double} operations.
+
 @opindex block-ops-unaligned-vsx
 @opindex no-block-ops-unaligned-vsx
 @item -mblock-ops-unaligned-vsx
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
new file mode 100644
index 00000000000..89343c44a34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 double elements.  */
+
+typedef double vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmadddp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubdp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
new file mode 100644
index 00000000000..c598f6307d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 8 float elements.  */
+
+typedef float vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmaddsp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubsp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       19 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}       8 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
new file mode 100644
index 00000000000..b1952b046f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef long long vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddudm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubudm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegd, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegd\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
new file mode 100644
index 00000000000..110292bb4df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef int vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduwm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuwm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegw, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvadduwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegw\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
new file mode 100644
index 00000000000..8921b04c468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef short vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduhm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuhm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsubuhm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}     24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}    13 } } */
+/* { dg-final { scan-assembler-times {\mvadduhm\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuhm\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
new file mode 100644
index 00000000000..a905e6b0a31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef unsigned char vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddubm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsububm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsububm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}      24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}     13 } } */
+/* { dg-final { scan-assembler-times {\mvaddubm\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsububm\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}   1 } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/meissner/heads/work146-vsize)] Add vector_size(32) support.
@ 2023-11-17 23:32 Michael Meissner
  0 siblings, 0 replies; 4+ messages in thread
From: Michael Meissner @ 2023-11-17 23:32 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:81d00c71e5ed67bf434f2cab1521296f3070dd4a

commit 81d00c71e5ed67bf434f2cab1521296f3070dd4a
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Nov 17 18:31:58 2023 -0500

    Add vector_size(32) support.
    
    2023-11-17  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/predicates.md (const_0_to_31_operand): New predicate.
            (mma_assemble_input_operand): Allow any 16-byte vector type, not just
            V16QImode.
            * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
            __VECTOR_SIZE_32__ if -mvector-size-32 used.
            * config/rs6000/rs6000-protos.h (rs6000_expand_vector_pair_init): New
            declaration.
            (rs6000_expand_vector_pair_set): Likewise.
            (split_unary_vector_pair): Likewise.
            (split_binary_vector_pair): Likewisee.
            (split_fma_vector_pair): Likewise.
            * config/rs6000/rs6000.cc (rs6000_hard_regno_mode_ok_uncached): Add
            support for -mvector-size-32.
            (rs6000_hard_regno_mode_ok): Likewise.
            (rs6000_setup_reg_addr_masks): Likewise.
            (rs6000_init_hard_regno_mode_ok): Likewise.
            (rs6000_option_override_internal): Likewise.
            (rs6000_builtin_vectorization_cost): Likewise.
            (vector_pair_to_vector_mode): New function.
            (+rs6000_split_vpair_constan): Likewise.
            (rs6000_expand_vector_pair_ini): Likewise.
            (rs6000_expand_vector_pair_set): Likewise.
            (reg_offset_addressing_ok_p): Likewise.
            (rs6000_emit_move): Likewise.
            (altivec_expand_vec_perm_le): Likewise.
            (rs6000_opt_var): Add -mvector-size-32.
            (split_unary_vector_pair): New function.
            (split_binary_vector_pair): Likewise.
            (split_fma_vector_pai): Likewise.
            (rs6000_split_multireg_move): Add -mvector-size-32 support.
            * config/rs6000/rs6000.h (VECTOR_PAIR_MODE): New macro.
            * config/rs6000/rs6000.md (wd mode attribute): Add vector pair modes.
            (RELOAD mode iterator): Likewise.
            (toplevel): Include vector-pair.md.
            * config/rs6000/rs6000.opt (-mvector-size-32): New option.
            * config/rs6000/vector-pair.md: New file.
            * config/rs6000/vsx.md (vsx_extract_v4df): New insn.
            (vsx_extract_v8sf): Likewise.
            * doc/invoke.texi (RS/6000 and PowerPC Options): Document
            -mvector-size-32.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vector-size-32-1.c: New test.
            * gcc.target/powerpc/vector-size-32-2.c: Likewise.
            * gcc.target/powerpc/vector-size-32-3.c: Likewise.
            * gcc.target/powerpc/vector-size-32-4.c: Likewise.
            * gcc.target/powerpc/vector-size-32-5.c: Likewise.
            * gcc.target/powerpc/vector-size-32-6.c: Likewise.

Diff:
---
 gcc/config/rs6000/predicates.md                    |   9 +-
 gcc/config/rs6000/rs6000-c.cc                      |   3 +
 gcc/config/rs6000/rs6000-protos.h                  |   7 +
 gcc/config/rs6000/rs6000.cc                        | 377 ++++++++++-
 gcc/config/rs6000/rs6000.h                         |   6 +
 gcc/config/rs6000/rs6000.md                        |   7 +-
 gcc/config/rs6000/rs6000.opt                       |   4 +
 gcc/config/rs6000/vector-pair.md                   | 704 +++++++++++++++++++++
 gcc/config/rs6000/vsx.md                           |  56 ++
 gcc/doc/invoke.texi                                |  14 +-
 .../gcc.target/powerpc/vector-size-32-1.c          |  85 +++
 .../gcc.target/powerpc/vector-size-32-2.c          |  96 +++
 .../gcc.target/powerpc/vector-size-32-3.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-4.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-5.c          | 137 ++++
 .../gcc.target/powerpc/vector-size-32-6.c          | 137 ++++
 16 files changed, 1893 insertions(+), 23 deletions(-)

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index ef7d3f214c4..8a56487d7d2 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -327,6 +327,11 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
 
+;; Match op = 0..31
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
 ;; Return 1 if op is a 34-bit constant integer.
 (define_predicate "cint34_operand"
   (match_code "const_int")
@@ -1301,8 +1306,10 @@
 
 ;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
 (define_special_predicate "mma_assemble_input_operand"
-  (match_test "(mode == V16QImode
+  (match_test "(GET_MODE_SIZE (mode) == 16 && VECTOR_MODE_P (mode)
 		&& (vsx_register_operand (op, mode)
+		    || op == CONST0_RTX (mode)
+		    || vsx_prefixed_constant (op, mode)
 		    || (MEM_P (op)
 			&& (indexed_or_indirect_address (XEXP (op, 0), mode)
 			    || quad_address_p (XEXP (op, 0), mode, false)))))"))
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 65be0ac43e2..27114b14022 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -631,6 +631,9 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
     builtin_define ("__SIZEOF_IBM128__=16");
   if (ieee128_float_type_node)
     builtin_define ("__SIZEOF_IEEE128__=16");
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    builtin_define ("__VECTOR_SIZE_32__");
+
 #ifdef TARGET_LIBC_PROVIDES_HWCAP_IN_TCB
   builtin_define ("__BUILTIN_CPU_SUPPORTS__");
 #endif
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..13687c5b1b3 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -61,7 +61,9 @@ extern bool rs6000_move_128bit_ok_p (rtx []);
 extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
+extern void rs6000_expand_vector_pair_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, rtx);
+extern void rs6000_expand_vector_pair_set (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
@@ -138,6 +140,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool);
 extern void output_toc (FILE *, rtx, int, machine_mode);
 extern void rs6000_fatal_bad_address (rtx);
 extern rtx create_TOC_reference (rtx, rtx);
+extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx));
+extern void split_binary_vector_pair (machine_mode, rtx [],
+				      rtx (*)(rtx, rtx, rtx));
+extern void split_fma_vector_pair (machine_mode, rtx [],
+				   rtx (*)(rtx, rtx, rtx, rtx));
 extern void rs6000_split_multireg_move (rtx, rtx);
 extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode);
 extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0dd21e67dde..1b28dd6e1dd 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -1843,7 +1843,7 @@ rs6000_hard_regno_mode_ok_uncached (int regno, machine_mode mode)
 
   /* Vector pair modes need even/odd VSX register pairs.  Only allow vector
      registers.  */
-  if (mode == OOmode)
+  if (VECTOR_PAIR_MODE (mode))
     return (TARGET_MMA && VSX_REGNO_P (regno) && (regno & 1) == 0);
 
   /* MMA accumulator modes need FPR registers divisible by 4.  */
@@ -1954,9 +1954,10 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
    GPR registers, and TImode can go in any GPR as well as VSX registers (PR
    57744).
 
-   Similarly, don't allow OOmode (vector pair, restricted to even VSX
-   registers) or XOmode (vector quad, restricted to FPR registers divisible
-   by 4) to tie with other modes.
+   Similarly, don't allow XOmode (vector quad, restricted to FPR registers
+   divisible by 4) to tie with other modes.
+
+   Vector pair modes can tie with other vector pair modes.
 
    Altivec/VSX vector tests were moved ahead of scalar float mode, so that IEEE
    128-bit floating point on VSX systems ties with other vectors.  */
@@ -1964,9 +1965,14 @@ rs6000_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
 static bool
 rs6000_modes_tieable_p (machine_mode mode1, machine_mode mode2)
 {
-  if (mode1 == PTImode || mode1 == OOmode || mode1 == XOmode
-      || mode2 == PTImode || mode2 == OOmode || mode2 == XOmode)
-    return mode1 == mode2;
+  if (mode1 == PTImode || mode1 == XOmode
+       || mode2 == PTImode || mode2 == XOmode)
+     return mode1 == mode2;
+ 
+  if (VECTOR_PAIR_MODE (mode1))
+    return VECTOR_PAIR_MODE (mode2);
+  if (VECTOR_PAIR_MODE (mode2))
+    return ALTIVEC_OR_VSX_VECTOR_MODE (mode1);
 
   if (ALTIVEC_OR_VSX_VECTOR_MODE (mode1))
     return ALTIVEC_OR_VSX_VECTOR_MODE (mode2);
@@ -2715,13 +2721,13 @@ rs6000_setup_reg_addr_masks (void)
 	     of the LXVP or STXVP instructions, do not allow indexed mode so
 	     that we can split the load/store.  */
 	  else if ((addr_mask != 0) && TARGET_MMA
-		   && (m2 == OOmode || m2 == XOmode))
+		   && (VECTOR_PAIR_MODE (m2) || m2 == XOmode))
 	    {
 	      addr_mask |= RELOAD_REG_OFFSET;
 	      if (rc == RELOAD_REG_FPR || rc == RELOAD_REG_VMX)
 		{
 		  addr_mask |= RELOAD_REG_QUAD_OFFSET;
-		  if (m2 == OOmode
+		  if (VECTOR_PAIR_MODE (m2)
 		      && TARGET_LOAD_VECTOR_PAIR
 		      && TARGET_STORE_VECTOR_PAIR)
 		    addr_mask |= RELOAD_REG_INDEXED;
@@ -2941,6 +2947,33 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
       rs6000_vector_align[XOmode] = 512;
     }
 
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      rs6000_vector_unit[V32QImode] = VECTOR_NONE;
+      rs6000_vector_mem[V32QImode] = VECTOR_VSX;
+      rs6000_vector_align[V32QImode] = 256;
+
+      rs6000_vector_unit[V16HImode] = VECTOR_NONE;
+      rs6000_vector_mem[V16HImode] = VECTOR_VSX;
+      rs6000_vector_align[V16HImode] = 256;
+
+      rs6000_vector_unit[V8SImode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SImode] = VECTOR_VSX;
+      rs6000_vector_align[V8SImode] = 256;
+
+      rs6000_vector_unit[V8SFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V8SFmode] = VECTOR_VSX;
+      rs6000_vector_align[V8SFmode] = 256;
+
+      rs6000_vector_unit[V4DImode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DImode] = VECTOR_VSX;
+      rs6000_vector_align[V4DImode] = 256;
+
+      rs6000_vector_unit[V4DFmode] = VECTOR_NONE;
+      rs6000_vector_mem[V4DFmode] = VECTOR_VSX;
+      rs6000_vector_align[V4DFmode] = 256;
+    }
+
   /* Register class constraints for the constraints that depend on compile
      switches. When the VSX code was added, different constraints were added
      based on the type (DFmode, V2DFmode, V4SFmode).  For the vector types, all
@@ -3072,6 +3105,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 		  reg_addr[XOmode].reload_store = CODE_FOR_reload_xo_di_store;
 		  reg_addr[XOmode].reload_load = CODE_FOR_reload_xo_di_load;
 		}
+
+	      if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+		{
+		  reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_di_store;
+		  reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_di_load;
+		  reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_di_store;
+		  reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_di_load;
+		  reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_di_store;
+		  reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_di_load;
+		  reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_di_store;
+		  reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_di_load;
+		  reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_di_store;
+		  reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_di_load;
+		  reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_di_store;
+		  reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_di_load;
+		}
 	    }
 	}
       else
@@ -3129,6 +3178,22 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 	      reg_addr[DDmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdd;
 	      reg_addr[DFmode].reload_fpr_gpr = CODE_FOR_reload_fpr_from_gprdf;
 	    }
+
+	  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+	    {
+	      reg_addr[V32QImode].reload_store = CODE_FOR_reload_v32qi_si_store;
+	      reg_addr[V32QImode].reload_load = CODE_FOR_reload_v32qi_si_load;
+	      reg_addr[V16HImode].reload_store = CODE_FOR_reload_v16hi_si_store;
+	      reg_addr[V16HImode].reload_load = CODE_FOR_reload_v16hi_si_load;
+	      reg_addr[V8SImode].reload_store = CODE_FOR_reload_v8si_si_store;
+	      reg_addr[V8SImode].reload_load = CODE_FOR_reload_v8si_si_load;
+	      reg_addr[V8SFmode].reload_store = CODE_FOR_reload_v8sf_si_store;
+	      reg_addr[V8SFmode].reload_load = CODE_FOR_reload_v8sf_si_load;
+	      reg_addr[V4DImode].reload_store = CODE_FOR_reload_v4di_si_store;
+	      reg_addr[V4DImode].reload_load = CODE_FOR_reload_v4di_si_load;
+	      reg_addr[V4DFmode].reload_store = CODE_FOR_reload_v4df_si_store;
+	      reg_addr[V4DFmode].reload_load = CODE_FOR_reload_v4df_si_load;
+	    }
 	}
 
       reg_addr[DFmode].scalar_in_vmx_p = true;
@@ -4429,6 +4494,15 @@ rs6000_option_override_internal (bool global_init_p)
       rs6000_isa_flags &= OPTION_MASK_STORE_VECTOR_PAIR;
     }
 
+  if (!TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (OPTION_SET_P (TARGET_VECTOR_SIZE_32))
+	warning (0, "%qs should not be used unless you use %qs",
+		 "-mvector-size-32", "-mmma");
+
+      TARGET_VECTOR_SIZE_32 = 0;
+    }
+
   /* Enable power10 fusion if we are tuning for power10, even if we aren't
      generating power10 instructions.  */
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION))
@@ -5174,6 +5248,15 @@ rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 static machine_mode
 rs6000_preferred_simd_mode (scalar_mode mode)
 {
+  /* Prefer vector pair for floating point, but not for integer modes.  */
+  if (TARGET_MMA && TARGET_VECTOR_SIZE_32)
+    {
+      if (mode == DFmode)
+	return V4DFmode;
+      else if (mode == SFmode)
+	return V8SFmode;
+    }
+
   opt_machine_mode vmode = mode_for_vector (mode, 16 / GET_MODE_SIZE (mode));
 
   if (vmode.exists () && !VECTOR_MEM_NONE_P (vmode.require ()))
@@ -7275,6 +7358,141 @@ rs6000_expand_vector_init (rtx target, rtx vals)
   emit_move_insn (target, mem);
 }
 
+/* For a vector pair mode, return the equivalent vector mode or VOIDmode.  */
+
+static machine_mode
+vector_pair_to_vector_mode (machine_mode mode)
+{
+  machine_mode vmode;
+
+  switch (mode)
+    {
+    case E_V32QImode:  vmode = V16QImode; break;
+    case E_V16HImode:  vmode = V8HImode;  break;
+    case E_V8SImode:   vmode = V4SImode;  break;
+    case E_V4DImode:   vmode = V2DImode;  break;
+    case E_V8SFmode:   vmode = V4SFmode;  break;
+    case E_V4DFmode:   vmode = V2DFmode;  break;
+    default:           vmode = VOIDmode;  break;
+    }
+
+  return vmode;
+}
+
+/* Split a vector constant for a type that can be held into a vector register
+   pair into 2 separate constants that can be held in a single vector register.
+   Return true if we can split the constant.  */
+
+static bool
+rs6000_split_vpair_constant (rtx op, rtx *high, rtx *low)
+{
+  machine_mode vmode = vector_pair_to_vector_mode (GET_MODE (op));
+
+  *high = *low = NULL_RTX;
+
+  if (!CONST_VECTOR_P (op) || vmode == GET_MODE (op))
+    return false;
+
+  size_t nunits = GET_MODE_NUNITS (vmode);
+  rtvec hi_vec = rtvec_alloc (nunits);
+  rtvec lo_vec = rtvec_alloc (nunits);
+
+  for (size_t i = 0; i < nunits; i++)
+    {
+      RTVEC_ELT (hi_vec, i) = CONST_VECTOR_ELT (op, i);
+      RTVEC_ELT (lo_vec, i) = CONST_VECTOR_ELT (op, i + nunits);
+    }
+
+  *high = gen_rtx_CONST_VECTOR (vmode, hi_vec);
+  *low = gen_rtx_CONST_VECTOR (vmode, lo_vec);
+  return true;
+}
+
+/* Initialize vector pair TARGET to VALS.  */
+
+void
+rs6000_expand_vector_pair_init (rtx target, rtx vals)
+{
+  machine_mode mode_vpair = GET_MODE (target);
+  machine_mode mode_vector;
+  size_t n_elts_vpair = GET_MODE_NUNITS (mode_vpair);
+  bool all_same = true;
+  rtx first = XVECEXP (vals, 0, 0);
+  rtx (*gen_splat) (rtx, rtx);
+  rtx (*gen_concat) (rtx, rtx, rtx);
+
+  switch (mode_vpair)
+    {
+    case E_V32QImode:
+      mode_vector = V16QImode;
+      gen_splat = gen_vpair_splat_v32qi;
+      gen_concat = gen_vpair_concat_v32qi;
+      break;
+
+    case E_V16HImode:
+      mode_vector = V8HImode;
+      gen_splat = gen_vpair_splat_v16hi;
+      gen_concat = gen_vpair_concat_v16hi;
+      break;
+
+    case E_V8SImode:
+      mode_vector = V4SImode;
+      gen_splat = gen_vpair_splat_v8si;
+      gen_concat = gen_vpair_concat_v8si;
+      break;
+
+    case E_V4DImode:
+      mode_vector = V2DImode;
+      gen_splat = gen_vpair_splat_v4di;
+      gen_concat = gen_vpair_concat_v4di;
+      break;
+
+    case E_V8SFmode:
+      mode_vector = V4SFmode;
+      gen_splat = gen_vpair_splat_v8sf;
+      gen_concat = gen_vpair_concat_v8sf;
+      break;
+
+    case E_V4DFmode:
+      mode_vector = V2DFmode;
+      gen_splat = gen_vpair_splat_v4df;
+      gen_concat = gen_vpair_concat_v4df;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* See if we can do a splat operation.  */
+  for (size_t i = 1; i < n_elts_vpair; ++i)
+    {
+      if (!rtx_equal_p (XVECEXP (vals, 0, i), first))
+	{
+	  all_same = false;
+	  break;
+	}
+    }
+
+  if (all_same)
+    {
+      emit_insn (gen_splat (target, first));
+      return;
+    }
+
+  /* Break the initialization into two parts.  */
+  rtx vector_hi = gen_reg_rtx (mode_vector);
+  rtx vector_lo = gen_reg_rtx (mode_vector);
+  rtx vals_hi;
+  rtx vals_lo;
+
+  rs6000_split_vpair_constant (vals, &vals_hi, &vals_lo);
+
+  rs6000_expand_vector_init (vector_hi, vals_hi);
+  rs6000_expand_vector_init (vector_lo, vals_lo);
+  emit_insn (gen_concat (target, vector_hi, vector_lo));
+  return;
+}
+
 /* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX
    is variable and also counts by vector element size for p9 and above.  */
 
@@ -7603,6 +7821,15 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Set field ELT_RTX of vaector pair TARGET to VAL.  */
+
+void
+rs6000_expand_vector_pair_set (rtx target, rtx val, rtx elt_rtx)
+{
+  if (target || val || elt_rtx)
+    gcc_unreachable ();
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
@@ -8694,6 +8921,8 @@ reg_offset_addressing_ok_p (machine_mode mode)
       /* The vector pair/quad types support offset addressing if the
 	 underlying vectors support offset addressing.  */
     case E_OOmode:
+    case E_V8SFmode:
+    case E_V4DFmode:
     case E_XOmode:
       return TARGET_MMA;
 
@@ -11197,8 +11426,10 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
 
     case E_V16QImode:
     case E_V8HImode:
+    case E_V8SFmode:
     case E_V4SFmode:
     case E_V4SImode:
+    case E_V4DFmode:
     case E_V2DFmode:
     case E_V2DImode:
     case E_V1TImode:
@@ -13456,7 +13687,7 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
      the GPR registers.  */
   if (rclass == GEN_OR_FLOAT_REGS)
     {
-      if (mode == OOmode)
+      if (VECTOR_PAIR_MODE (mode))
 	return VSX_REGS;
 
       if (mode == XOmode)
@@ -23417,6 +23648,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   rtx tmp = target;
   rtx norreg = gen_reg_rtx (V16QImode);
   machine_mode mode = GET_MODE (target);
+  machine_mode qi_vmode = VECTOR_PAIR_MODE (mode) ? V32QImode : V16QImode;
 
   /* Get everything in regs so the pattern matches.  */
   if (!REG_P (op0))
@@ -23424,7 +23656,7 @@ altivec_expand_vec_perm_le (rtx operands[4])
   if (!REG_P (op1))
     op1 = force_reg (mode, op1);
   if (!REG_P (sel))
-    sel = force_reg (V16QImode, sel);
+    sel = force_reg (qi_vmode, sel);
   if (!REG_P (target))
     tmp = gen_reg_rtx (mode);
 
@@ -23437,10 +23669,10 @@ altivec_expand_vec_perm_le (rtx operands[4])
     {
       /* Invert the selector with a VNAND if available, else a VNOR.
 	 The VNAND is preferred for future fusion opportunities.  */
-      notx = gen_rtx_NOT (V16QImode, sel);
+      notx = gen_rtx_NOT (qi_vmode, sel);
       iorx = (TARGET_P8_VECTOR
-	      ? gen_rtx_IOR (V16QImode, notx, notx)
-	      : gen_rtx_AND (V16QImode, notx, notx));
+	      ? gen_rtx_IOR (qi_vmode, notx, notx)
+	      : gen_rtx_AND (qi_vmode, notx, notx));
       emit_insn (gen_rtx_SET (norreg, iorx));
 
       /* Permute with operands reversed and adjusted selector.  */
@@ -24572,6 +24804,9 @@ static struct rs6000_opt_var const rs6000_opt_vars[] =
   { "speculate-indirect-jumps",
     offsetof (struct gcc_options, x_rs6000_speculate_indirect_jumps),
     offsetof (struct cl_target_option, x_rs6000_speculate_indirect_jumps), },
+  { "vector-size-32",
+    offsetof (struct gcc_options, x_TARGET_VECTOR_SIZE_32),
+    offsetof (struct cl_target_option, x_TARGET_VECTOR_SIZE_32), },
 };
 
 /* Inner function to handle attribute((target("..."))) and #pragma GCC target
@@ -27408,6 +27643,80 @@ rs6000_split_logical (rtx operands[3],
   return;
 }
 
+/* Split a unary vector pair insn into two separate vector insns.  */
+
+void
+split_unary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx))		/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1));
+  return;
+}
+
+/* Split a binary vector pair insn into two separate vector insns.  */
+
+void
+split_binary_vector_pair (machine_mode mode,		/* vector mode.  */
+			 rtx operands[],		/* dest, src.  */
+			 rtx (*func)(rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1));
+  return;
+}
+
+/* Split a fused multiply-add vector pair insn into two separate vector
+   insns.  */
+
+void
+split_fma_vector_pair (machine_mode mode,		/* vector mode.  */
+		       rtx operands[],			/* dest, src.  */
+		       rtx (*func)(rtx, rtx, rtx, rtx))	/* create insn.  */
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  machine_mode orig_mode = GET_MODE (op0);
+
+  rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0);
+  rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0);
+  rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0);
+  rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0);
+
+  rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16);
+  rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16);
+  rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16);
+  rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16);
+
+  emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0));
+  emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1));
+  return;
+}
+
 /* Emit instructions to move SRC to DST.  Called by splitters for
    multi-register moves.  It will emit at most one instruction for
    each register that is accessed; that is, it won't emit li/lis pairs
@@ -27426,6 +27735,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
   int reg_mode_size;
   /* The number of registers that will be moved.  */
   int nregs;
+  /* Hi/lo values for splitting vector pair constants.  */
+  rtx vpair_hi, vpair_lo;
 
   reg = REG_P (dst) ? REGNO (dst) : REGNO (src);
   mode = GET_MODE (dst);
@@ -27441,8 +27752,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
     }
   /* If we have a vector pair/quad mode, split it into two/four separate
      vectors.  */
-  else if (mode == OOmode || mode == XOmode)
-    reg_mode = V1TImode;
+  else if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
+    {
+      machine_mode vmode = vector_pair_to_vector_mode (mode);
+      reg_mode = (vmode == VOIDmode) ? V1TImode : vmode;
+    }
   else if (FP_REGNO_P (reg))
     reg_mode = DECIMAL_FLOAT_MODE_P (mode) ? DDmode :
 	(TARGET_HARD_FLOAT ? DFmode : SFmode);
@@ -27454,6 +27768,29 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 
   gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
 
+  /* Handle vector pair constants.  */
+  if (CONST_VECTOR_P (src) && VECTOR_PAIR_MODE (mode) && TARGET_MMA
+      && rs6000_split_vpair_constant (src, &vpair_hi, &vpair_lo)
+      && VSX_REGNO_P (reg))
+    {
+      reg_mode = GET_MODE (vpair_hi);
+      rtx reg_hi = gen_rtx_REG (reg_mode, reg);
+      rtx reg_lo = gen_rtx_REG (reg_mode, reg + 1);
+
+      emit_move_insn (reg_hi, vpair_hi);
+
+      /* 0.0 is easy.  For other constants, copy the high register into the low
+	 register if the two sets of constants are equal.  This means we won't
+	 be doing back to back prefixed load immediate instructions.  */
+      if (rtx_equal_p (vpair_hi, vpair_lo)
+	  && !rtx_equal_p (vpair_hi, CONST0_RTX (reg_mode)))
+	emit_move_insn (reg_lo, reg_hi);
+      else
+	emit_move_insn (reg_lo, vpair_lo);
+      
+      return;
+    }
+      
   /* TDmode residing in FP registers is special, since the ISA requires that
      the lower-numbered word of a register pair is always the most significant
      word, even in little-endian mode.  This does not match the usual subreg
@@ -27493,7 +27830,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
      below.  This means the last register gets the first memory
      location.  We also need to be careful of using the right register
      numbers if we are splitting XO to OO.  */
-  if (mode == OOmode || mode == XOmode)
+  if (VECTOR_PAIR_MODE (mode) || mode == XOmode)
     {
       nregs = hard_regno_nregs (reg, mode);
       int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
@@ -27553,7 +27890,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	  gcc_assert (REG_P (dst));
 	  if (GET_MODE (src) == XOmode)
 	    gcc_assert (FP_REGNO_P (REGNO (dst)));
-	  if (GET_MODE (src) == OOmode)
+	  if (VECTOR_PAIR_MODE (GET_MODE (src)))
 	    gcc_assert (VSX_REGNO_P (REGNO (dst)));
 
 	  int nvecs = XVECLEN (src, 0);
@@ -27628,7 +27965,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	 overlap.  */
       int i;
       /* XO/OO are opaque so cannot use subregs. */
-      if (mode == OOmode || mode == XOmode )
+      if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	{
 	  for (i = nregs - 1; i >= 0; i--)
 	    {
@@ -27802,7 +28139,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 	    continue;
 
 	  /* XO/OO are opaque so cannot use subregs. */
-	  if (mode == OOmode || mode == XOmode )
+	  if (VECTOR_PAIR_MODE (mode) || mode == XOmode )
 	    {
 	      rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
 	      rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 326c45221e9..32848f7d15b 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -1006,6 +1006,12 @@ enum data_align { align_abi, align_opt, align_both };
   (ALTIVEC_VECTOR_MODE (MODE) || VSX_VECTOR_MODE (MODE)			\
    || (MODE) == V2DImode || (MODE) == V1TImode)
 
+/* Whether a mode is held in paired vector registers.  */
+#define VECTOR_PAIR_MODE(MODE)						\
+  ((MODE) == OOmode							\
+   || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode)
+
 /* Post-reload, we can't use any new AltiVec registers, as we already
    emitted the vrsave mask.  */
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dcf1f3526f5..29292c5f5b5 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -683,9 +683,13 @@
 		      (HI    "h")
 		      (SI    "w")
 		      (DI    "d")
+		      (V32QI "b")
 		      (V16QI "b")
+		      (V16HI "h")
 		      (V8HI  "h")
+		      (V8SI  "w")
 		      (V4SI  "w")
+		      (V4DI  "d")
 		      (V2DI  "d")
 		      (V1TI  "q")
 		      (TI    "q")])
@@ -812,7 +816,7 @@
 ;; supplement addressing modes.
 (define_mode_iterator RELOAD [V16QI V8HI V4SI V2DI V4SF V2DF V1TI
 			      SF SD SI DF DD DI TI PTI KF IF TF
-			      OO XO])
+			      OO XO V8SF V4DF V32QI V16HI V8SI V4DI])
 
 ;; Iterate over smin, smax
 (define_code_iterator fp_minmax	[smin smax])
@@ -15767,6 +15771,7 @@
 (include "vsx.md")
 (include "altivec.md")
 (include "mma.md")
+(include "vector-pair.md")
 (include "dfp.md")
 (include "crypto.md")
 (include "htm.md")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 369095df9ed..38e24e20032 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -605,6 +605,10 @@ mstore-vector-pair
 Target Undocumented Mask(STORE_VECTOR_PAIR) Var(rs6000_isa_flags)
 Generate (do not generate) store vector pair instructions.
 
+mvector-size-32
+Target Var(TARGET_VECTOR_SIZE_32) Init(0) Save
+Generate (do not generate) vector pair instructions for float/double vector_size(32).
+
 mrelative-jumptables
 Target Undocumented Var(rs6000_relative_jumptables) Init(1) Save
 
diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md
new file mode 100644
index 00000000000..51b93a43ab0
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.md
@@ -0,0 +1,704 @@
+;; Vector pair arithmetic and logical instruction support.
+;; Copyright (C) 2020-2023 Free Software Foundation, Inc.
+;; Contributed by Peter Bergner <bergner@linux.ibm.com> and
+;;		  Michael Meissner <meissner@linux.ibm.com>
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; This function adds support for doing vector operations on pairs of vector
+;; registers.  Most of the instructions use vector pair instructions to load
+;; and possibly store registers, but splitting the operation after register
+;; allocation to do 2 separate operations.  The second scheduler pass can
+;; interleave other instructions between these pairs of instructions if
+;; possible.
+
+;; Iterator for all vector pair modes.  Even though we do not provide integer
+;; vector pair operations at this time, we need to support loading and storing
+;; integer vector pairs for perumte operations (and eventually compare).
+(define_mode_iterator VPAIR [V32QI V16HI V8SI V4DI V8SF V4DF])
+
+;; Floating point vector pair ops
+(define_mode_iterator VPAIR_FP [V8SF V4DF])
+
+;; Iterator for floating point unary/binary operations.
+(define_code_iterator VPAIR_FP_UNARY  [abs neg])
+(define_code_iterator VPAIR_FP_BINARY [plus minus mult smin smax])
+
+;; Integer vector pair ops.  We need the basic logical opts to support
+;; permution on little endian systems.
+(define_mode_iterator VPAIR_INT [V32QI V16HI V8SI V4DI])
+
+;; Special iterators for NEG (V4SI and V2DI have vneg{w,d}), while V16QI and
+;; V8HI have to use a subtract from 0.
+(define_mode_iterator VPAIR_NEG_VNEG [V4DI V8SI])
+(define_mode_iterator VPAIR_NEG_SUB [V32QI V16HI])
+
+;; Iterator integer unary/binary operations.  Logical operations can be done on
+;; all VSX registers, while the binary int operators need Altivec registers.
+(define_code_iterator VPAIR_LOGICAL_UNARY  [not])
+(define_code_iterator VPAIR_LOGICAL_BINARY [and ior xor])
+
+(define_code_iterator VPAIR_INT_BINARY     [plus minus smin smax])
+
+;; Give the insn name from the opertion
+(define_code_attr vpair_op [(abs   "abs")
+			    (div   "div")
+			    (and   "and")
+			    (fma   "fma")
+			    (ior   "ior")
+			    (minus "sub")
+			    (mult  "mul")
+			    (neg   "neg")
+			    (not   "one_cmpl")
+			    (plus  "add")
+			    (smin  "smin")
+			    (smax  "smax")
+			    (xor   "xor")])
+
+;; Map vector pair mode to vector mode in upper case after the vector pair is
+;; split to two vectors.
+(define_mode_attr VPAIR_VECTOR [(V32QI "V16QI")
+				(V16HI "V8HI")
+				(V8SI  "V4SI")
+				(V4DI  "V2DI")
+				(V8SF  "V4SF")
+                                (V4DF  "V2DF")])
+
+;; Map vector pair mode to vector mode in lower case after the vector pair is
+;; split to two vectors.
+(define_mode_attr vpair_vector_l [(V32QI "v16qi")
+				  (V16HI "v8hi")
+				  (V8SI  "v4si")
+				  (V4DI  "v2di")
+				  (V8SF  "v4sf")
+				  (V4DF  "v2df")])
+
+;; Map vector pair mode to the base element mode.
+(define_mode_attr VPAIR_ELEMENT [(V32QI "QI")
+				 (V16HI "HI")
+				 (V8SI  "SI")
+				 (V4DI  "DI")
+				 (V8SF  "SF")
+				 (V4DF  "DF")])
+
+;; Map vector pair mode to the base element mode in lower case.
+(define_mode_attr vpair_element_l [(V32QI "qi")
+				   (V16HI "hi")
+				   (V8SI  "si")
+				   (V4DI  "di")
+				   (V8SF  "sf")
+				   (V4DF  "df")])
+
+;; Vector pair move support.
+(define_expand "mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand")
+	(match_operand:VPAIR 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_emit_move (operands[0], operands[1], <MODE>mode);
+  DONE;
+})
+
+(define_insn_and_split "*mov<mode>"
+  [(set (match_operand:VPAIR 0 "nonimmediate_operand" "=wa,wa,ZwO,QwO,wa,wa")
+	(match_operand:VPAIR 1 "input_operand" "ZwO,QwO,wa,wa,wa,j"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && (gpc_reg_operand (operands[0], <MODE>mode)
+       || gpc_reg_operand (operands[1], <MODE>mode))"
+  "@
+   lxvp%X1 %x0,%1
+   #
+   stxvp%X0 %x1,%0
+   #
+   #
+   #"
+  "&& reload_completed
+   && ((MEM_P (operands[0]) && !TARGET_STORE_VECTOR_PAIR)
+       || (MEM_P (operands[1]) && !TARGET_LOAD_VECTOR_PAIR)
+       || (!MEM_P (operands[0]) && !MEM_P (operands[1])))"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (<MODE>mode))
+    {
+      machine_mode vmode = <VPAIR_VECTOR>mode;
+      rtx op0_reg0 = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+      rtx op0_reg1 = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+      rtx zero = CONST0_RTX (vmode);
+      emit_move_insn (op0_reg0, zero);
+      emit_move_insn (op0_reg1, zero);
+      DONE;
+    }
+
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecload,vecstore,vecstore,veclogical,vecperm")
+   (set_attr "size" "256")
+   (set_attr "length" "*,8,*,8,8,8")
+   (set_attr "isa" "lxvp,*,stxvp,*,*,*")])
+\f
+;; Vector pair initialization
+(define_expand "vec_init<mode><vpair_element_l>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:VPAIR 1 "")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_init (operands[0], operands[1]);
+  DONE;
+})
+
+;; Vector pair set element
+(define_expand "vec_set<mode>"
+  [(match_operand:VPAIR 0 "vsx_register_operand")
+   (match_operand:<VPAIR_ELEMENT> 1 "register_operand")
+   (match_operand 2 "vec_set_index_operand")]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  rs6000_expand_vector_pair_set (operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+;; Vector pair extraction
+(define_insn_and_split "vec_extract<mode><vpair_element_l>"
+  [(set (match_operand:<VPAIR_ELEMENT> 0 "vsx_register_operand" "=wa")
+	(vec_select:<VPAIR_ELEMENT>
+	 (match_operand:VPAIR 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_int_operand" "n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  HOST_WIDE_INT elt = INTVAL (operands[2]);
+  machine_mode mode = <MODE>mode;
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  unsigned vsize = GET_MODE_SIZE (<VPAIR_VECTOR>mode);
+  unsigned reg_num = ((WORDS_BIG_ENDIAN && elt >= vsize)
+		      || (!WORDS_BIG_ENDIAN && elt < vsize));
+	   
+  rtx vreg = simplify_gen_subreg (vmode, op1, mode, reg_num * 16);
+  emit_insn (gen_vsx_extract_<vpair_vector_l> (op0, vreg,
+					       GEN_INT (elt % vsize)));
+  DONE;
+})
+
+;; Assemble a vector pair from two vectors.
+;;
+;; We have both endian versions to change which input register will be moved
+;; the the first register in the vector pair.
+(define_expand "vpair_concat_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32")
+
+(define_insn_and_split "*vpair_concat_<mode>_be"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=wa,&wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "0,mwajeP")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*vpair_concat_<mode>_le"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand" "=&wa,wa")
+	(vec_concat:VPAIR
+	 (match_operand:<VPAIR_VECTOR> 1 "input_operand" "mwajeP,0")
+	 (match_operand:<VPAIR_VECTOR> 2 "input_operand" "mwajeP,mwajeP")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32 && !WORDS_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (match_dup 2))]
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  operands[3] = simplify_gen_subreg (vmode, op0, <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (vmode, op0, <MODE>mode, 16);
+}
+  [(set_attr "length" "8")])
+
+;; Zero a vector pair
+(define_expand "vpair_zero_<mode>"
+  [(set (match_operand:VPAIR 0 "vsx_register_operand") (match_dup 1))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  operands[1] = CONST0_RTX (<MODE>mode);
+})
+
+;; Create a vector pair with a value splat'ed (duplicated) to all of the
+;; elements.
+(define_expand "vpair_splat_<mode>"
+  [(use (match_operand:VPAIR 0 "vsx_register_operand"))
+   (use (match_operand:<VPAIR_ELEMENT> 1 "input_operand"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+{
+  machine_mode vmode = <VPAIR_VECTOR>mode;
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+
+  if (op1 == CONST0_RTX (vmode))
+    {
+      emit_insn (gen_vpair_zero_<mode> (op0));
+      DONE;
+    }
+
+  rtx tmp = gen_reg_rtx (vmode);
+
+  unsigned num_elements = GET_MODE_NUNITS (vmode);
+  rtvec elements = rtvec_alloc (num_elements);
+  for (size_t i = 0; i < num_elements; i++)
+    RTVEC_ELT (elements, i) = copy_rtx (op1);
+
+  rtx vec_elements = gen_rtx_PARALLEL (vmode, elements);
+  rs6000_expand_vector_init (tmp, vec_elements);
+  emit_insn (gen_vpair_concat_<mode> (op0, tmp, tmp));
+  DONE;
+})
+\f
+;; Vector pair floating point arithmetic unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_UNARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize negative absolute value (both floating point and integer)
+(define_insn_and_split "nabs<mode>2"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(neg:VPAIR_FP
+	 (abs:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_vsx_nabs<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point arithmetic binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa")
+	(VPAIR_FP_BINARY:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-add
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused multiply-subtract
+(define_insn_and_split "fms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(fma:VPAIR_FP
+	 (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	 (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	 (neg:VPAIR_FP
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_fms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point negative fused multiply-add
+(define_insn_and_split "nfma<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfma<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair floating point fused negative multiply-subtract
+(define_insn_and_split "nfms<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0")
+	  (neg:VPAIR_FP
+	   (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_fma_vector_pair (<VPAIR_VECTOR>mode, operands,
+			 gen_nfms<vpair_vector_l>4);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) + c into fma (a, b, c)
+(define_insn_and_split "*fma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(plus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (match_dup 3)))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (a * b) - c into fma (a, b, -c)
+(define_insn_and_split "*fms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(minus:VPAIR_FP
+	 (mult:VPAIR_FP
+	  (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	 (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:VPAIR_FP (match_dup 1)
+		      (match_dup 2)
+		      (neg:VPAIR_FP
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) + c) into -fma (a, b, c)
+(define_insn_and_split "*nfma_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (plus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (match_dup 3))))]
+{
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair -((a * b) - c) into -fma (a, b, -c)
+(define_insn_and_split "*nfms_fpcontract_<mode>4"
+  [(set (match_operand:VPAIR_FP 0 "vsx_register_operand" "=wa,wa")
+	(neg:VPAIR_FP
+	 (minus:VPAIR_FP
+	  (mult:VPAIR_FP
+	   (match_operand:VPAIR_FP 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:VPAIR_FP 2 "vsx_register_operand" "wa,0"))
+	  (match_operand:VPAIR_FP 3 "vsx_register_operand" "0,wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32
+   && flag_fp_contract_mode == FP_CONTRACT_FAST"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(neg:VPAIR_FP
+	 (fma:VPAIR_FP (match_dup 1)
+		       (match_dup 2)
+		       (neg:VPAIR_FP
+			(match_dup 3)))))]
+{
+}
+  [(set_attr "length" "8")])
+\f
+;; Vector pair negate if we have the VNEGx instruction.
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_VNEG 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_VNEG
+	 (match_operand:VPAIR_NEG_VNEG 1 "vsx_register_operand" "v")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_neg<vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair negate if we have to do a subtract from 0
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VPAIR_NEG_SUB 0 "vsx_register_operand" "=v")
+	(neg:VPAIR_NEG_SUB
+	 (match_operand:VPAIR_NEG_SUB 1 "vsx_register_operand" "v")))
+   (clobber (match_scratch:<VPAIR_VECTOR> 2 "=&v"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = <VPAIR_VECTOR>mode;
+  rtx tmp = operands[2];
+  unsigned reg0 = reg_or_subregno (operands[0]);
+  unsigned reg1 = reg_or_subregno (operands[1]);
+
+  emit_move_insn (tmp, CONST0_RTX (mode));
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0),
+				       tmp,
+				       gen_rtx_REG (mode, reg1)));
+
+  emit_insn (gen_sub<vpair_vector_l>3 (gen_rtx_REG (mode, reg0 + 1),
+				       tmp,
+				       gen_rtx_REG (mode, reg1 + 1)));
+
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical unary operations
+(define_insn_and_split "<vpair_op><mode>2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_UNARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_unary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			   gen_<vpair_op><vpair_vector_l>2);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Vector pair logical binary operations
+(define_insn_and_split "<vpair_op><mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(VPAIR_LOGICAL_BINARY:VPAIR_INT
+	 (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_<vpair_op><vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optiomize vector pair ~(a | b)  or ((~a) & (~b)) to produce xxlnor
+(define_insn_and_split "*nor<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (ior:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nor<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) & b to use xxlandc
+(define_insn_and_split "*andc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(and:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_andc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair ~(a ^ b) to produce xxleqv
+(define_insn_and_split "*eqv<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (xor:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nor<vpair_vector_l>3);
+  DONE;
+}
+[(set_attr "length" "8")])
+
+
+;; Optiomize vector pair ~(a & b) or ((~a) | (~b)) to produce xxlnand
+(define_insn_and_split "*nand<mode>3_1"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(not:VPAIR_INT
+	 (and:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa")
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+(define_insn_and_split "*nand<mode>3_2"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa"))))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_nand<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
+
+;; Optimize vector pair (~a) | b to produce xxlorc
+(define_insn_and_split "*orc<mode>3"
+  [(set (match_operand:VPAIR_INT 0 "vsx_register_operand" "=wa")
+	(ior:VPAIR_INT
+	 (not:VPAIR_INT
+	  (match_operand:VPAIR_INT 1 "vsx_register_operand" "wa"))
+	 (match_operand:VPAIR_INT 2 "vsx_register_operand" "wa")))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  split_binary_vector_pair (<VPAIR_VECTOR>mode, operands,
+			    gen_orc<vpair_vector_l>3);
+  DONE;
+}
+  [(set_attr "length" "8")])
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 26fa32829af..7d1f372d76e 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3566,6 +3566,33 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Exctract DF from V4DF, convert it into extract from V2DF.
+(define_insn_and_split "vsx_extract_v4df"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=wa,r")
+	(vec_select:DF
+	 (match_operand:V4DF 1 "gpc_reg_operand" "wa,wa")
+	 (parallel
+	  [(match_operand:QI 2 "const_0_to_3_operand" "n,n")])))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(vec_select:DF
+	 (match_dup 3)
+	 (parallel [(match_dup 4)])))]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 2)
+      || (!WORDS_BIG_ENDIAN && element < 2))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V2DFmode, reg_num);
+  operands[4] = GEN_INT (element & 1);
+}
+  [(set_attr "type" "mfvsr,vecperm")])
+
 ;; Extract a SF element from V4SF
 (define_insn_and_split "vsx_extract_v4sf"
   [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
@@ -3653,6 +3680,35 @@
 }
   [(set_attr "type" "fpload,load")])
 
+;; Extract SF from V8SF, converting it into an extract from V4SF
+(define_insn_and_split "vsx_extract_v8sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+	(vec_select:SF
+	 (match_operand:V8SF 1 "vsx_register_operand" "wa")
+	 (parallel [(match_operand:QI 2 "const_0_to_7_operand" "n")])))
+   (clobber (match_scratch:V4SF 3 "=0"))]
+  "TARGET_MMA && TARGET_VECTOR_SIZE_32"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (vec_select:SF
+		    (match_dup 4)
+		    (parallel [(match_dup 5)])))
+	      (clobber (match_dup 3))])]
+{
+  HOST_WIDE_INT element = INTVAL (operands[2]);
+  unsigned reg_num = reg_or_subregno (operands[1]);
+
+  if ((WORDS_BIG_ENDIAN && element >= 4)
+      || (!WORDS_BIG_ENDIAN && element < 4))
+    reg_num++;
+
+  operands[3] = gen_rtx_REG (V4SFmode, reg_num);
+  operands[4] = GEN_INT (element & 3);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "fp")])
+
 ;; Expand the builtin form of xxpermdi to canonical rtl.
 (define_expand "vsx_xxpermdi_<mode>"
   [(match_operand:VSX_L 0 "vsx_register_operand")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1748afdbfe0..786883df413 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1311,7 +1311,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg}
 -mstack-protector-guard-offset=@var{offset} -mprefixed -mno-prefixed
 -mpcrel -mno-pcrel -mmma -mno-mmma -mrop-protect -mno-rop-protect
--mprivileged -mno-privileged}
+-mprivileged -mno-privileged -mvector-size-32 -mno-vector-size-32}
 
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu
@@ -30960,6 +30960,18 @@ optimization (@option{-fshrink-wrap}).
 @itemx -mno-privileged
 Generate (do not generate) code that will run in privileged state.
 
+@opindex mvector-size-32
+@opindex mno-vector-size-32
+@item -mvector-size-32
+@itemx -mno-vector-size-32
+Generate (do not generate) code that will use the load vector pair and
+store vector pair instructions for vectorization.  This options
+requires @option{-mmma} to be enabled.
+
+At the present time, @option{-mvector-size-32} is not enabled by
+default.  If you use @option{-mvector-size-32}, if will only enable
+vectorized for @code{float} and @code{double} operations.
+
 @opindex block-ops-unaligned-vsx
 @opindex no-block-ops-unaligned-vsx
 @item -mblock-ops-unaligned-vsx
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
new file mode 100644
index 00000000000..89343c44a34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-1.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 double elements.  */
+
+typedef double vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegdp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmadddp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubdp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
new file mode 100644
index 00000000000..c598f6307d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 8 float elements.  */
+
+typedef float vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_multiply (vectype_t *dest,
+	       vectype_t *a,
+	       vectype_t *b)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  *dest = *a * *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xvnegsp, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_fma (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) + *c;
+}
+
+void
+test_fms (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b,
+	  vectype_t *c)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  *dest = (*a * *b) - *c;
+}
+
+void
+test_nfma (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmaddsp, 1 stxvp.  */
+  *dest = -((*a * *b) + *c);
+}
+
+void
+test_nfms (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b,
+	   vectype_t *c)
+{
+  /* 2 lxvp, 2 xvnmsubsp, 1 stxvp.  */
+  *dest = -((*a * *b) - *c);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       19 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}       8 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
new file mode 100644
index 00000000000..b1952b046f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-3.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef long long vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddudm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubudm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegd, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegd\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubudm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
new file mode 100644
index 00000000000..110292bb4df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-4.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef int vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduwm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuwm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 2 vnegw, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}    24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}   13 } } */
+/* { dg-final { scan-assembler-times {\mvadduwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mvnegw\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuwm\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}  4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}   2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
new file mode 100644
index 00000000000..8921b04c468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-5.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef short vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vadduhm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsubuhm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsubuhm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}     24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}    13 } } */
+/* { dg-final { scan-assembler-times {\mvadduhm\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mvsubuhm\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}   4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
new file mode 100644
index 00000000000..a905e6b0a31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector-size-32-6.c
@@ -0,0 +1,137 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -mvector-size-32" } */
+
+/* Test whether the __attrbiute__((__vector_size(32))) generates paired vector
+   loads and stores with the -mvector-size-32 option.  This file tests 32-byte
+   vectors with 4 64-bit integer elements.  */
+
+typedef unsigned char vectype_t __attribute__((__vector_size__(32)));
+
+void
+test_add (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vaddubm, 1 stxvp.  */
+  *dest = *a + *b;
+}
+
+void
+test_sub (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 vsububm, 1 stxvp.  */
+  *dest = *a - *b;
+}
+
+void
+test_negate (vectype_t *dest,
+	     vectype_t *a)
+{
+  /* 2 lxvp, 1 xxspltib, 2 vsububm, 1 stxvp.  */
+  *dest = - *a;
+}
+
+void
+test_not (vectype_t *dest,
+	  vectype_t *a)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~ *a;
+}
+
+void
+test_and (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxland, 1 stxvp.  */
+  *dest = *a & *b;
+}
+
+void
+test_or (vectype_t *dest,
+	 vectype_t *a,
+	 vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlor, 1 stxvp.  */
+  *dest = *a | *b;
+}
+
+void
+test_xor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlxor, 1 stxvp.  */
+  *dest = *a ^ *b;
+}
+
+void
+test_andc_1 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = (~ *a) & *b;
+}
+
+void
+test_andc_2 (vectype_t *dest,
+	     vectype_t *a,
+	     vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlandc, 1 stxvp.  */
+  *dest = *a & (~ *b);
+}
+
+void
+test_orc_1 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = (~ *a) | *b;
+}
+
+void
+test_orc_2 (vectype_t *dest,
+	    vectype_t *a,
+	    vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlorc, 1 stxvp.  */
+  *dest = *a | (~ *b);
+}
+
+void
+test_nand (vectype_t *dest,
+	   vectype_t *a,
+	   vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnand, 1 stxvp.  */
+  *dest = ~(*a & *b);
+}
+
+void
+test_nor (vectype_t *dest,
+	  vectype_t *a,
+	  vectype_t *b)
+{
+  /* 2 lxvp, 2 xxlnor, 1 stxvp.  */
+  *dest = ~(*a | *b);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}      24 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}     13 } } */
+/* { dg-final { scan-assembler-times {\mvaddubm\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mvsububm\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxland\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxlandc\M}    4 } } */
+/* { dg-final { scan-assembler-times {\mxxlnand\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mxxlorc\M}     4 } } */
+/* { dg-final { scan-assembler-times {\mxxlxor\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxxspltib\M}   1 } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-11-18  8:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-18  8:01 [gcc(refs/users/meissner/heads/work146-vsize)] Add vector_size(32) support Michael Meissner
  -- strict thread matches above, loose matches on Subject: below --
2023-11-18  5:47 Michael Meissner
2023-11-18  4:38 Michael Meissner
2023-11-17 23:32 Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).