diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 83d29cbfac1..806c3335cbc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5231,6 +5231,12 @@ typedef struct _rs6000_cost_data
 {
   struct loop *loop_info;
   unsigned cost[3];
+  /* Total number of vectorized stmts (loop only).  */
+  unsigned nstmts;
+  /* Total number of loads (loop only).  */
+  unsigned nloads;
+  /* Possible extra penalized cost on vector construction (loop only).  */
+  unsigned extra_ctor_cost;
   /* For each vectorized loop, this var holds TRUE iff a non-memory vector
      instruction is needed by the vectorization.  */
   bool vect_nonmem;
@@ -5292,9 +5298,45 @@ rs6000_density_test (rs6000_cost_data *data)
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "density %d%%, cost %d exceeds threshold, penalizing "
-			 "loop body cost by %d%%", density_pct,
+			 "loop body cost by %d%%\n", density_pct,
 			 vec_cost + not_vec_cost, DENSITY_PENALTY);
     }
+
+  /* Check if we need to penalize the body cost for latency and
+     execution resources bound from strided or elementwise loads
+     into a vector.  */
+  if (data->extra_ctor_cost > 0)
+    {
+      /* Threshold for load stmts percentage in all vectorized stmts.  */
+      const int DENSITY_LOAD_PCT_THRESHOLD = 45;
+      /* Threshold for total number of load stmts.  */
+      const int DENSITY_LOAD_NUM_THRESHOLD = 20;
+
+      gcc_assert (data->nloads <= data->nstmts);
+      unsigned int load_pct = (data->nloads * 100) / (data->nstmts);
+
+      /* It's likely to be bounded by latency and execution resources
+	 from many scalar loads which are strided or elementwise loads
+	 into a vector if both conditions below are found:
+	   1. there are many loads, it's easy to result in a long wait
+	      for load units;
+	   2. load has a big proportion of all vectorized statements,
+	      it's not easy to schedule other statements to spread among
+	      the loads.
+	 One typical case is the innermost loop of the hotspot of SPEC2017
+	 503.bwaves_r without loop interchange.  */
+      if (data->nloads > DENSITY_LOAD_NUM_THRESHOLD
+	  && load_pct > DENSITY_LOAD_PCT_THRESHOLD)
+	{
+	  data->cost[vect_body] += data->extra_ctor_cost;
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Found %u loads and load pct. %u%% exceed "
+			     "the threshold, penalizing loop body "
+			     "cost by extra cost %u for ctor.\n",
+			     data->nloads, load_pct, data->extra_ctor_cost);
+	}
+    }
 }
 
 /* Implement targetm.vectorize.init_cost.  */
@@ -5308,6 +5350,9 @@ rs6000_init_cost (struct loop *loop_info, bool costing_for_scalar)
   data->cost[vect_body]     = 0;
   data->cost[vect_epilogue] = 0;
   data->vect_nonmem = false;
+  data->nstmts = 0;
+  data->nloads = 0;
+  data->extra_ctor_cost = 0;
   data->costing_for_scalar = costing_for_scalar;
   return data;
 }
@@ -5335,6 +5380,66 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind,
   return 0;
 }
 
+/* As a visitor function for each statement cost entry handled in
+   function add_stmt_cost, gather some information and update its
+   relevant fields in target cost accordingly.  */
+static void
+rs6000_update_target_cost_per_stmt (rs6000_cost_data *data,
+				    enum vect_cost_for_stmt kind,
+				    struct _stmt_vec_info *stmt_info,
+				    enum vect_cost_model_location where,
+				    int stmt_cost, unsigned int orig_count)
+{
+
+  /* Check whether we're doing something other than just a copy loop.
+     Not all such loops may be profitably vectorized; see
+     rs6000_finish_cost.  */
+  if ((kind == vec_to_scalar || kind == vec_perm || kind == vec_promote_demote
+       || kind == vec_construct || kind == scalar_to_vec)
+      || (where == vect_body && kind == vector_stmt))
+    data->vect_nonmem = true;
+
+  /* Gather some information when we are costing the vector version for
+     the statements locate in a loop body.  */
+  if (!data->costing_for_scalar && data->loop_info && where == vect_body)
+    {
+      data->nstmts += orig_count;
+
+      if (kind == scalar_load || kind == vector_load || kind == unaligned_load
+	  || kind == vector_gather_load)
+	data->nloads += orig_count;
+
+      /* If we have strided or elementwise loads into a vector, it's
+	 possible to be bounded by latency and execution resources for
+	 many scalar loads.  Try to account for this by scaling the
+	 construction cost by the number of elements involved, when
+	 handling each matching statement we record the possible extra
+	 penalized cost into target cost, in the end of costing for
+	 the whole loop, we do the actual penalization once some load
+	 density heuristics are satisfied.  */
+      if (kind == vec_construct && stmt_info
+	  && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
+	  && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+	      || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP))
+	{
+	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+	  unsigned int nunits = vect_nunits_for_cost (vectype);
+	  unsigned int extra_cost = nunits * stmt_cost;
+	  /* As function rs6000_builtin_vectorization_cost shows, we have
+	     priced much on V16QI/V8HI vector construction as their units,
+	     if we penalize them with nunits * stmt_cost, it can result in
+	     a unreliable body cost, eg: for V16QI on Power8, stmt_cost is
+	     20 and nunits is 16, the extra cost is 320 which looks much
+	     exaggerated.  So let's use one maximum bound for the extra
+	     penalized cost for vector construction here.  */
+	  const unsigned int MAX_PENALIZED_COST_FOR_CTOR = 12;
+	  if (extra_cost > MAX_PENALIZED_COST_FOR_CTOR)
+	    extra_cost = MAX_PENALIZED_COST_FOR_CTOR;
+	  data->extra_ctor_cost += extra_cost;
+	}
+    }
+}
+
 /* Implement targetm.vectorize.add_stmt_cost.  */
 
 static unsigned
@@ -5354,6 +5459,7 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count,
       /* Statements in an inner loop relative to the loop being
 	 vectorized are weighted more heavily.  The value here is
 	 arbitrary and could potentially be improved with analysis.  */
+      unsigned int orig_count = count;
       if (where == vect_body && stmt_info
 	  && stmt_in_inner_loop_p (vinfo, stmt_info))
 	{
@@ -5365,14 +5471,8 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count,
       retval = (unsigned) (count * stmt_cost);
       cost_data->cost[where] += retval;
 
-      /* Check whether we're doing something other than just a copy loop.
-	 Not all such loops may be profitably vectorized; see
-	 rs6000_finish_cost.  */
-      if ((kind == vec_to_scalar || kind == vec_perm
-	   || kind == vec_promote_demote || kind == vec_construct
-	   || kind == scalar_to_vec)
-	  || (where == vect_body && kind == vector_stmt))
-	cost_data->vect_nonmem = true;
+      rs6000_update_target_cost_per_stmt (cost_data, kind, stmt_info, where,
+					  stmt_cost, orig_count);
     }
 
   return retval;