[aarch64] Improve code-gen for vector initialization with single constant element.

gcc/ChangeLog:
	* config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak condition
	if (n_var == n_elts && n_elts <= 16) to allow a single constant,
	and if maxv == 1, use constant element for duplicating into register.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/vec-init-single-const.c: New test.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2b0de7ca038..f46750133a6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
      and matches[X][1] with the count of duplicate elements (if X is the
      earliest element which has duplicates).  */
 
-  if (n_var == n_elts && n_elts <= 16)
+  if ((n_var >= n_elts - 1) && n_elts <= 16)
     {
       int matches[16][2] = {0};
       for (int i = 0; i < n_elts; i++)
@@ -22227,6 +22227,18 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	     vector register.  For big-endian we want that position to hold
 	     the last element of VALS.  */
 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
+
+	  /* If we have a single constant element, use that for duplicating
+	     instead.  */
+	  if (n_var == n_elts - 1)
+	    for (int i = 0; i < n_elts; i++)
+	      if (CONST_INT_P (XVECEXP (vals, 0, i))
+		  || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
+		{
+		  maxelement = i;
+		  break;
+		}
+
 	  rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
 	}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
new file mode 100644
index 00000000000..517f47b13ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.16b, w[0-9]+
+**	movi	v[0-9]+\.8b, 0x1
+**	ins	v[0-9]+\.b\[15\], v[0-9]+\.b\[0\]
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, x, x, x, x, x, x, x,
+                       x, x, x, x, x, x, x, 1 };
+}
+
+/*
+** f_s16:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
+**	ins	v[0-9]+\.h\[7\], v[0-9]+\.h\[0\]
+**	...
+**	ret
+*/
+
+int16x8_t f_s16(int16_t x)
+{
+  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
+}
+
+/*
+** f_s32:
+**	...
+**	movi	v[0-9]\.2s, 0x1
+**	dup	v[0-9]\.4s, w[0-9]+
+**	ins	v[0-9]+\.s\[3\], v[0-9]+\.s\[0\]
+**	...
+**	ret
+*/
+
+int32x4_t f_s32(int32_t x)
+{
+  return (int32x4_t) { x, x, x, 1 };
+}
+
+/*
+** f_s64:
+**	...
+**	fmov	d[0-9]+, x[0-9]+
+**	mov	x[0-9]+, 1
+**	ins	v[0-9]+\.d\[1\], x[0-9]+
+**	...
+**	ret
+*/
+
+int64x2_t f_s64(int64_t x)
+{
+  return (int64x2_t) { x, 1 };
+}