public inbox for gcc-patches@gcc.gnu.org
* [Patch, Vectorizer, SVE] fmin/fmax builtin reduction support
@ 2018-12-19  9:33 Alejandro Martinez Vicente
  2018-12-19 12:34 ` Richard Biener
  0 siblings, 1 reply; 3+ messages in thread
From: Alejandro Martinez Vicente @ 2018-12-19  9:33 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Sandiford, nd

[-- Attachment #1: Type: text/plain, Size: 4922 bytes --]

Hi all,
 
Loops that use the fmin/fmax builtins can be vectorized even without
-ffast-math using SVE's FMINNM/FMAXNM instructions. This is an example:
 
double
f (double *x, int n)
{
  double res = 100.0;
  for (int i = 0; i < n; ++i)
    res = __builtin_fmin (res, x[i]);
  return res;
}

Before this patch, the compiler would generate this code (-march=armv8.2-a+sve
-O2 -ftree-vectorize):

0000000000000000 <f>:
   0:   7100003f        cmp     w1, #0x0
   4:   5400018d        b.le    34 <f+0x34>
   8:   51000422        sub     w2, w1, #0x1
   c:   91002003        add     x3, x0, #0x8
  10:   d2e80b21        mov     x1, #0x4059000000000000
  14:   9e670020        fmov    d0, x1
  18:   8b224c62        add     x2, x3, w2, uxtw #3
  1c:   d503201f        nop
  20:   fc408401        ldr     d1, [x0],#8
  24:   1e617800        fminnm  d0, d0, d1
  28:   eb02001f        cmp     x0, x2
  2c:   54ffffa1        b.ne    20 <f+0x20>
  30:   d65f03c0        ret
  34:   d2e80b20        mov     x0, #0x4059000000000000
  38:   9e670000        fmov    d0, x0
  3c:   d65f03c0        ret

After this patch, this is the code that gets generated:

0000000000000000 <f>:
   0:   7100003f        cmp     w1, #0x0
   4:   5400020d        b.le    44 <f+0x44>
   8:   d2800002        mov     x2, #0x0
   c:   25d8e3e0        ptrue   p0.d
  10:   93407c21        sxtw    x1, w1
  14:   90000003        adrp    x3, 0 <f>
  18:   25804001        mov     p1.b, p0.b
  1c:   91000063        add     x3, x3, #0x0
  20:   85c0e060        ld1rd   {z0.d}, p0/z, [x3]
  24:   25e11fe0        whilelo p0.d, xzr, x1
  28:   a5e24001        ld1d    {z1.d}, p0/z, [x0, x2, lsl #3]
  2c:   04f0e3e2        incd    x2
  30:   65c58020        fminnm  z0.d, p0/m, z0.d, z1.d
  34:   25e11c40        whilelo p0.d, x2, x1
  38:   54ffff81        b.ne    28 <f+0x28>  // b.any
  3c:   65c52400        fminnmv d0, p1, z0.d
  40:   d65f03c0        ret
  44:   d2e80b20        mov     x0, #0x4059000000000000
  48:   9e670000        fmov    d0, x0
  4c:   d65f03c0        ret

This patch extends the support for reductions to include calls to internal
functions, in addition to assignment statements. For this purpose, in most
places where a tree_code was used, a code_helper is used instead; a
code_helper can hold either a tree_code or a combined_fn.
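
For illustration, dispatching on a code_helper looks roughly like this (a
minimal sketch using the interface this patch adds; handle_tree_code and
handle_combined_fn are hypothetical placeholders for whatever the caller
does in each case):

  code_helper code = code_helper_for_stmnt (stmt);
  if (code.is_tree_code ())
    /* Assignment-based reduction, e.g. PLUS_EXPR or MAX_EXPR.  */
    handle_tree_code ((enum tree_code) code);
  else
    /* Call-based reduction, e.g. CFN_BUILT_IN_FMIN.  */
    handle_combined_fn ((combined_fn) code);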

This patch implements these tasks:

- Detect a reduction candidate based on a call to an internal function
  (currently only fmin or fmax).
- Process the reduction using code_helper. This means that in several places
  we have to check whether this is an assignment-based reduction or a
  call-based reduction.
- Add new internal functions for the fmin/fmax reductions and for conditional
  fmin/fmax (see the sketch after this list). On architectures where IEEE
  fmin/fmax reductions are available, it is still possible to vectorize the
  loop using unconditional instructions.
- Update SVE's md to support these new reductions.
- Add new SVE tests to check that the optimal code is being generated.
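
For reference, the scalar semantics that the new reduc_fmin_scal optab is
expected to implement can be sketched like this (illustrative only, not part
of the patch; reduc_fmax_scal is analogous with __builtin_fmax):

  /* Reference semantics for an fmin reduction over n elements.
     __builtin_fmin follows IEEE minNum NaN handling: if one operand is
     a quiet NaN, the other operand is returned.  */
  double
  reduc_fmin_ref (const double *v, int n)
  {
    double r = v[0];
    for (int i = 1; i < n; ++i)
      r = __builtin_fmin (r, v[i]);
    return r;
  }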

I tested this patch on an aarch64 machine by bootstrapping the compiler and
running the testsuite.
 
Alejandro
 
gcc/ChangeLog:
 
2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gimple-match.h (code_helper_for_stmnt): New function to get a
	code_helper from a statement.
	* internal-fn.def: New reduc_fmax_scal and reduc_fmin_scal optabs for
	IEEE fp max/min reductions.
	* optabs.def: Likewise.
	* tree-vect-loop.c (reduction_fn_for_scalar_code): Changed function
	signature to accept code_helper instead of tree_code. Handle the
	fmax/fmin builtins.
	(needs_fold_left_reduction_p): Likewise.
	(check_reduction_path): Likewise.
	(vect_is_simple_reduction): Use code_helper instead of tree_code. Check
	for supported call-based reductions. Extend support for both
	assignment-based and call-based reductions.
	(vect_model_reduction_cost): Extend cost-model support to call-based
	reductions (just use MAX expression).
	(get_initial_def_for_reduction): Use code_helper instead of tree_code.
	Extend support for both assignment-based and call-based reductions.
	(vect_create_epilog_for_reduction): Likewise.
	(vectorizable_reduction): Likewise.
	* tree-vectorizer.h: Include gimple-match.h for code_helper. Use
	code_helper in check_reduction_path signature.
	* config/aarch64/aarch64-sve.md: Added define_expand to capture new
	reduc_fmax_scal and reduc_fmin_scal optabs.
	* config/aarch64/iterators.md: New FMAXMINNMV and fmaxmin_uns iterators
	to support the new define_expand.
 
gcc/testsuite/ChangeLog:
 
2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gcc.target/aarch64/sve/reduc_9.c: New test to check
	SVE-vectorized reductions without -ffast-math.
	* gcc.target/aarch64/sve/reduc_10.c: New test to check
	SVE-vectorized builtin reductions without -ffast-math.

[-- Attachment #2: final.patch --]
[-- Type: application/octet-stream, Size: 43978 bytes --]

diff --git gcc/config/aarch64/aarch64-sve.md gcc/config/aarch64/aarch64-sve.md
index 5cd591b..d9fbc79 100644
--- gcc/config/aarch64/aarch64-sve.md
+++ gcc/config/aarch64/aarch64-sve.md
@@ -2109,6 +2109,18 @@
   }
 )
 
+;; Unpredicated ieee floating-point MIN/MAX reduction.
+(define_expand "reduc_<fmaxmin_uns>_scal_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+	(unspec:<VEL> [(match_dup 2)
+		       (match_operand:SVE_F 1 "register_operand")]
+		      FMAXMINNMV))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
 ;; Predicated floating-point MIN/MAX reduction.
 (define_insn "*reduc_<maxmin_uns>_scal_<mode>"
   [(set (match_operand:<VEL> 0 "register_operand" "=w")
diff --git gcc/config/aarch64/iterators.md gcc/config/aarch64/iterators.md
index 524e4e6..ccc9f9d 100644
--- gcc/config/aarch64/iterators.md
+++ gcc/config/aarch64/iterators.md
@@ -474,6 +474,8 @@
     UNSPEC_COND_DIV	; Used in aarch64-sve.md.
     UNSPEC_COND_MAX	; Used in aarch64-sve.md.
     UNSPEC_COND_MIN	; Used in aarch64-sve.md.
+    UNSPEC_COND_FMAX	; Used in aarch64-sve.md.
+    UNSPEC_COND_FMIN	; Used in aarch64-sve.md.
     UNSPEC_COND_FMLA	; Used in aarch64-sve.md.
     UNSPEC_COND_FMLS	; Used in aarch64-sve.md.
     UNSPEC_COND_FNMLA	; Used in aarch64-sve.md.
@@ -1458,6 +1460,8 @@
 (define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV
 			       UNSPEC_FMAXNMV UNSPEC_FMINNMV])
 
+(define_int_iterator FMAXMINNMV [UNSPEC_FMAXNMV UNSPEC_FMINNMV])
+
 (define_int_iterator BITWISEV [UNSPEC_ANDV UNSPEC_IORV UNSPEC_XORV])
 
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
@@ -1569,7 +1573,8 @@
 
 (define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_ADD UNSPEC_COND_SUB
 					 UNSPEC_COND_MUL UNSPEC_COND_DIV
-					 UNSPEC_COND_MAX UNSPEC_COND_MIN])
+					 UNSPEC_COND_MAX UNSPEC_COND_MIN
+					 UNSPEC_COND_FMAX UNSPEC_COND_FMIN])
 
 (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA
 					  UNSPEC_COND_FMLS
@@ -1616,7 +1621,9 @@
 			(UNSPEC_COND_FMLA "fma")
 			(UNSPEC_COND_FMLS "fnma")
 			(UNSPEC_COND_FNMLA "fnms")
-			(UNSPEC_COND_FNMLS "fms")])
+			(UNSPEC_COND_FNMLS "fms")
+			(UNSPEC_COND_FMAX "fmax")
+			(UNSPEC_COND_FMIN "fmin")])
 
 (define_int_attr  maxmin_uns [(UNSPEC_UMAXV "umax")
 			      (UNSPEC_UMINV "umin")
@@ -1631,6 +1638,10 @@
 			      (UNSPEC_FMAXNM "fmax")
 			      (UNSPEC_FMINNM "fmin")])
 
+
+(define_int_attr  fmaxmin_uns [(UNSPEC_FMAXNMV "fmax")
+			       (UNSPEC_FMINNMV "fmin")])
+
 (define_int_attr  maxmin_uns_op [(UNSPEC_UMAXV "umax")
 				 (UNSPEC_UMINV "umin")
 				 (UNSPEC_SMAXV "smax")
@@ -1832,14 +1843,18 @@
 			    (UNSPEC_COND_MUL "fmul")
 			    (UNSPEC_COND_DIV "fdiv")
 			    (UNSPEC_COND_MAX "fmaxnm")
-			    (UNSPEC_COND_MIN "fminnm")])
+			    (UNSPEC_COND_MIN "fminnm")
+			    (UNSPEC_COND_FMAX "fmaxnm")
+			    (UNSPEC_COND_FMIN "fminnm")])
 
 (define_int_attr sve_fp_op_rev [(UNSPEC_COND_ADD "fadd")
 			        (UNSPEC_COND_SUB "fsubr")
 			        (UNSPEC_COND_MUL "fmul")
 			        (UNSPEC_COND_DIV "fdivr")
 			        (UNSPEC_COND_MAX "fmaxnm")
-			        (UNSPEC_COND_MIN "fminnm")])
+				(UNSPEC_COND_MIN "fminnm")
+				(UNSPEC_COND_FMAX "fmaxnm")
+				(UNSPEC_COND_FMIN "fminnm")])
 
 (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
 			      (UNSPEC_COND_FMLS "fmls")
diff --git gcc/gimple-match.h gcc/gimple-match.h
index b6eb888..fd657ac 100644
--- gcc/gimple-match.h
+++ gcc/gimple-match.h
@@ -327,6 +327,21 @@ gimple_simplified_result_is_gimple_val (const gimple_match_op *op)
 	  && is_gimple_val (op->ops[0]));
 }
 
+/* Return code_helper for a gassign or gcall.  */
+
+inline code_helper
+code_helper_for_stmnt (gimple * orig_stmt)
+{
+  code_helper code;
+  if (gassign * stmt = dyn_cast <gassign *> (orig_stmt))
+    code = code_helper (gimple_assign_rhs_code (stmt));
+  else if (gcall * stmt = dyn_cast <gcall *> (orig_stmt))
+    code = code_helper (gimple_call_combined_fn (stmt));
+  else
+    gcc_unreachable ();
+  return code;
+}
+
 extern tree (*mprts_hook) (gimple_match_op *);
 
 bool gimple_simplify (gimple *, gimple_match_op *, gimple_seq *,
diff --git gcc/internal-fn.c gcc/internal-fn.c
index d082dd5..629b689 100644
--- gcc/internal-fn.c
+++ gcc/internal-fn.c
@@ -3336,7 +3336,9 @@ conditional_internal_fn_code (internal_fn ifn)
   T (FMA) \
   T (FMS) \
   T (FNMA) \
-  T (FNMS)
+  T (FNMS) \
+  T (FMIN) \
+  T (FMAX)
 
 /* Return a function that only performs internal function FN when a
    certain condition is met and that uses a given fallback value otherwise.
diff --git gcc/internal-fn.def gcc/internal-fn.def
index cda314e..8ea43bf 100644
--- gcc/internal-fn.def
+++ gcc/internal-fn.def
@@ -173,6 +173,9 @@ DEF_INTERNAL_OPTAB_FN (COND_FMS, ECF_CONST, cond_fms, cond_ternary)
 DEF_INTERNAL_OPTAB_FN (COND_FNMA, ECF_CONST, cond_fnma, cond_ternary)
 DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
 
+DEF_INTERNAL_OPTAB_FN (COND_FMAX, ECF_CONST, cond_fmax, cond_binary)
+DEF_INTERNAL_OPTAB_FN (COND_FMIN, ECF_CONST, cond_fmin, cond_binary)
+
 DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
 
 DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
@@ -187,6 +190,10 @@ DEF_INTERNAL_OPTAB_FN (REDUC_IOR, ECF_CONST | ECF_NOTHROW,
 		       reduc_ior_scal, unary)
 DEF_INTERNAL_OPTAB_FN (REDUC_XOR, ECF_CONST | ECF_NOTHROW,
 		       reduc_xor_scal, unary)
+DEF_INTERNAL_OPTAB_FN (REDUC_FMAX, ECF_CONST | ECF_NOTHROW,
+		       reduc_fmax_scal, unary)
+DEF_INTERNAL_OPTAB_FN (REDUC_FMIN, ECF_CONST | ECF_NOTHROW,
+		       reduc_fmin_scal, unary)
 
 /* Extract the last active element from a vector.  */
 DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
diff --git gcc/optabs.def gcc/optabs.def
index 5a67f5e..8ed4de2 100644
--- gcc/optabs.def
+++ gcc/optabs.def
@@ -238,6 +238,8 @@ OPTAB_D (cond_fma_optab, "cond_fma$a")
 OPTAB_D (cond_fms_optab, "cond_fms$a")
 OPTAB_D (cond_fnma_optab, "cond_fnma$a")
 OPTAB_D (cond_fnms_optab, "cond_fnms$a")
+OPTAB_D (cond_fmin_optab, "cond_fmin$a")
+OPTAB_D (cond_fmax_optab, "cond_fmax$a")
 OPTAB_D (cmov_optab, "cmov$a6")
 OPTAB_D (cstore_optab, "cstore$a4")
 OPTAB_D (ctrap_optab, "ctrap$a4")
@@ -315,6 +317,8 @@ OPTAB_D (reduc_umin_scal_optab, "reduc_umin_scal_$a")
 OPTAB_D (reduc_and_scal_optab,  "reduc_and_scal_$a")
 OPTAB_D (reduc_ior_scal_optab,  "reduc_ior_scal_$a")
 OPTAB_D (reduc_xor_scal_optab,  "reduc_xor_scal_$a")
+OPTAB_D (reduc_fmax_scal_optab, "reduc_fmax_scal_$a")
+OPTAB_D (reduc_fmin_scal_optab, "reduc_fmin_scal_$a")
 OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
 
 OPTAB_D (extract_last_optab, "extract_last_$a")
diff --git gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 0000000..d5ebe97
--- /dev/null
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_REDUC_BI_MAXMIN(TYPE, NAME, FUNC)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_bi_##NAME##_##TYPE (TYPE *a, int n)	\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r = __builtin_##FUNC (r, a[i]);		\
+  return r;					\
+}
+
+#define TEST_BI_MAXMIN(T)			\
+  T (_Float16, max, fmaxf16)			\
+  T (float, max, fmaxf)				\
+  T (double, max, fmax)				\
+						\
+  T (_Float16, min, fminf16)			\
+  T (float, min, fminf)				\
+  T (double, min, fmin)
+
+TEST_BI_MAXMIN (DEF_REDUC_BI_MAXMIN)
+
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmaxnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
diff --git gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 0000000..9147565
--- /dev/null
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,201 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_REDUC_PLUS(TYPE)			\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_plus_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 0;					\
+  for (int i = 0; i < n; ++i)			\
+    r += a[i];					\
+  return r;					\
+}
+
+#define TEST_PLUS(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)					\
+  T (uint8_t)					\
+  T (uint16_t)					\
+  T (uint32_t)					\
+  T (uint64_t)					\
+  T (_Float16)					\
+  T (float)					\
+  T (double)
+
+TEST_PLUS (DEF_REDUC_PLUS)
+
+#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_##NAME##_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r = a[i] CMP_OP r ? a[i] : r;		\
+  return r;					\
+}
+
+#define TEST_MAXMIN(T)				\
+  T (int8_t, max, >)				\
+  T (int16_t, max, >)				\
+  T (int32_t, max, >)				\
+  T (int64_t, max, >)				\
+  T (uint8_t, max, >)				\
+  T (uint16_t, max, >)				\
+  T (uint32_t, max, >)				\
+  T (uint64_t, max, >)				\
+  T (_Float16, max, >)				\
+  T (float, max, >)				\
+  T (double, max, >)				\
+						\
+  T (int8_t, min, <)				\
+  T (int16_t, min, <)				\
+  T (int32_t, min, <)				\
+  T (int64_t, min, <)				\
+  T (uint8_t, min, <)				\
+  T (uint16_t, min, <)				\
+  T (uint32_t, min, <)				\
+  T (uint64_t, min, <)				\
+  T (_Float16, min, <)				\
+  T (float, min, <)				\
+  T (double, min, <)
+
+TEST_MAXMIN (DEF_REDUC_MAXMIN)
+
+#define DEF_REDUC_BITWISE(TYPE, NAME, BIT_OP)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_##NAME##_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r BIT_OP a[i];				\
+  return r;					\
+}
+
+#define TEST_BITWISE(T)				\
+  T (int8_t, and, &=)				\
+  T (int16_t, and, &=)				\
+  T (int32_t, and, &=)				\
+  T (int64_t, and, &=)				\
+  T (uint8_t, and, &=)				\
+  T (uint16_t, and, &=)				\
+  T (uint32_t, and, &=)				\
+  T (uint64_t, and, &=)				\
+						\
+  T (int8_t, ior, |=)				\
+  T (int16_t, ior, |=)				\
+  T (int32_t, ior, |=)				\
+  T (int64_t, ior, |=)				\
+  T (uint8_t, ior, |=)				\
+  T (uint16_t, ior, |=)				\
+  T (uint32_t, ior, |=)				\
+  T (uint64_t, ior, |=)				\
+						\
+  T (int8_t, xor, ^=)				\
+  T (int16_t, xor, ^=)				\
+  T (int32_t, xor, ^=)				\
+  T (int64_t, xor, ^=)				\
+  T (uint8_t, xor, ^=)				\
+  T (uint16_t, xor, ^=)				\
+  T (uint32_t, xor, ^=)				\
+  T (uint64_t, xor, ^=)
+
+TEST_BITWISE (DEF_REDUC_BITWISE)
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsmaxv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tsminv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tandv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teorv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
diff --git gcc/tree-vect-loop.c gcc/tree-vect-loop.c
index 633c315..20f2f06 100644
--- gcc/tree-vect-loop.c
+++ gcc/tree-vect-loop.c
@@ -54,6 +54,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
 #include "tree-eh.h"
+#include "case-cfn-macros.h"
+#include "gimple-match.h"
+#include "builtins.h"
 
 /* Loop Vectorization Pass.
 
@@ -2322,7 +2325,7 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
 /* Function reduction_fn_for_scalar_code
 
    Input:
-   CODE - tree_code of a reduction operations.
+   CODE - code_helper of a reduction operation.
 
    Output:
    REDUC_FN - the corresponding internal function to be used to reduce the
@@ -2333,21 +2336,22 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
 static bool
-reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
+reduction_fn_for_scalar_code (code_helper code, internal_fn * reduc_fn)
 {
-  switch (code)
-    {
+  if (code.is_tree_code ())
+    switch ((enum tree_code) code)
+      {
       case MAX_EXPR:
-        *reduc_fn = IFN_REDUC_MAX;
-        return true;
+	*reduc_fn = IFN_REDUC_MAX;
+	return true;
 
       case MIN_EXPR:
-        *reduc_fn = IFN_REDUC_MIN;
-        return true;
+	*reduc_fn = IFN_REDUC_MIN;
+	return true;
 
       case PLUS_EXPR:
-        *reduc_fn = IFN_REDUC_PLUS;
-        return true;
+	*reduc_fn = IFN_REDUC_PLUS;
+	return true;
 
       case BIT_AND_EXPR:
 	*reduc_fn = IFN_REDUC_AND;
@@ -2363,12 +2367,28 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
 
       case MULT_EXPR:
       case MINUS_EXPR:
-        *reduc_fn = IFN_LAST;
-        return true;
+	*reduc_fn = IFN_LAST;
+	return true;
 
       default:
-       return false;
-    }
+	return false;
+      }
+  else
+    switch ((combined_fn) code)
+      {
+      CASE_CFN_FMAX:
+      CASE_CFN_FMAX_FN:
+	*reduc_fn = IFN_REDUC_FMAX;
+	return true;
+
+      CASE_CFN_FMIN:
+      CASE_CFN_FMIN_FN:
+	*reduc_fn = IFN_REDUC_FMIN;
+	return true;
+
+      default:
+	return false;
+      }
 }
 
 /* If there is a neutral value X such that SLP reduction NODE would not
@@ -2616,9 +2636,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
    overflow must wrap.  */
 
 static bool
-needs_fold_left_reduction_p (tree type, tree_code code,
+needs_fold_left_reduction_p (tree type, code_helper orig_code,
 			     bool need_wrapping_integral_overflow)
 {
+  if (orig_code.is_fn_code ())
+    return false;
+  enum tree_code code = orig_code;
+
   /* CHECKME: check for !flag_finite_math_only too?  */
   if (SCALAR_FLOAT_TYPE_P (type))
     switch (code)
@@ -2653,7 +2677,7 @@ needs_fold_left_reduction_p (tree type, tree_code code,
 
 bool
 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
-		      tree loop_arg, enum tree_code code)
+		      tree loop_arg, code_helper code)
 {
   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
   auto_bitmap visited;
@@ -2752,7 +2776,6 @@ pop:
   return ! fail && ! neg;
 }
 
-
 /* Function vect_is_simple_reduction
 
    (1) Detect a cross-iteration def-use cycle that represents a simple
@@ -2808,13 +2831,13 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   gimple *phi_use_stmt = NULL;
-  enum tree_code orig_code, code;
+  code_helper orig_code, code;
   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
   tree type;
   tree name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
-  bool phi_def;
+  bool phi_def, is_call;
 
   *double_reduc = false;
   *v_reduc_type = TREE_CODE_REDUCTION;
@@ -2865,11 +2888,19 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     {
       name = gimple_assign_lhs (def_stmt);
       phi_def = false;
+      is_call = false;
     }
   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
     {
       name = PHI_RESULT (def_stmt);
       phi_def = true;
+      is_call = false;
+    }
+  else if (gcall *def_stmt = dyn_cast <gcall *> (def_stmt_info->stmt))
+    {
+      name = gimple_call_lhs (def_stmt);
+      phi_def = false;
+      is_call = true;
     }
   else
     {
@@ -2970,8 +3001,43 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 	  }
     }
 
-  gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  code = orig_code = gimple_assign_rhs_code (def_stmt);
+  if (is_call)
+    {
+      gcall *def_stmt = as_a <gcall *> (def_stmt_info->stmt);
+
+      if (!gimple_call_builtin_p (def_stmt))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "reduction: unhandled reduction "
+			     "with non-builtin call: %G",
+			     def_stmt_info->stmt);
+	  return NULL;
+	}
+
+      code = orig_code = gimple_call_combined_fn (def_stmt);
+
+      switch ((combined_fn) orig_code)
+	{
+	CASE_CFN_FMAX:
+	CASE_CFN_FMAX_FN:
+	CASE_CFN_FMIN:
+	CASE_CFN_FMIN_FN:
+	  break;
+	default:
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "reduction: unhandled reduction with builtin: %G",
+			     def_stmt_info->stmt);
+	  return NULL;
+	}
+    }
+  else
+    {
+      gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
+      code = orig_code = gimple_assign_rhs_code (def_stmt);
+    }
+  gimple *def_stmt = def_stmt_info->stmt;
 
   if (nested_in_vect_loop && !check_reduction)
     {
@@ -3026,17 +3092,28 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       op1 = gimple_assign_rhs2 (def_stmt);
       op2 = gimple_assign_rhs3 (def_stmt);
     }
-  else if (!commutative_tree_code (code) || !associative_tree_code (code))
+  else if (!is_call
+	   && (!commutative_tree_code (code)
+	       || !associative_tree_code (code)))
     {
       if (dump_enabled_p ())
 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
 			"reduction: not commutative/associative: ");
       return NULL;
     }
-  else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
+  else if (is_call || get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
     {
-      op1 = gimple_assign_rhs1 (def_stmt);
-      op2 = gimple_assign_rhs2 (def_stmt);
+      if (is_call)
+	{
+	  gcc_assert (gimple_call_num_args (def_stmt) == 2);
+	  op1 = gimple_call_arg (def_stmt, 0);
+	  op2 = gimple_call_arg (def_stmt, 1);
+	}
+      else
+	{
+	  op1 = gimple_assign_rhs1 (def_stmt);
+	  op2 = gimple_assign_rhs2 (def_stmt);
+	}
     }
   else
     {
@@ -3055,7 +3132,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       return NULL;
     }
 
-  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
+  type = TREE_TYPE (gimple_get_lhs (def_stmt));
   if ((TREE_CODE (op1) == SSA_NAME
        && !types_compatible_p (type,TREE_TYPE (op1)))
       || (TREE_CODE (op2) == SSA_NAME
@@ -3164,6 +3241,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 		  return NULL;
 		}
 	    }
+	  else if (is_call)
+	    swap_ssa_operands (def_stmt, gimple_call_arg_ptr (def_stmt, 0),
+			       gimple_call_arg_ptr (def_stmt, 1));
 	  else
 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
 			       gimple_assign_rhs2_ptr (def_stmt));
@@ -3172,7 +3252,10 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 	    report_vect_op (MSG_NOTE, def_stmt,
 			    "detected reduction: need to swap operands: ");
 
-	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
+	  if (!is_call && CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
+	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
+	  else if (is_call
+		   && CONSTANT_CLASS_P (gimple_call_arg (def_stmt, 0)))
 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
         }
       else
@@ -3745,6 +3828,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 			   int ncopies, stmt_vector_for_cost *cost_vec)
 {
   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
+  code_helper orig_code;
   enum tree_code code;
   optab optab;
   tree vectype;
@@ -3765,7 +3849,9 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
   mode = TYPE_MODE (vectype);
   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
 
-  code = gimple_assign_rhs_code (orig_stmt_info->stmt);
+  orig_code = code_helper_for_stmnt (orig_stmt_info->stmt);
+  /* Use MAX_EXPR tree_code for the call-based reductions.  */
+  code = orig_code.is_tree_code () ? (enum tree_code) orig_code : MAX_EXPR;
 
   if (reduction_type == EXTRACT_LAST_REDUCTION
       || reduction_type == FOLD_LEFT_REDUCTION)
@@ -3861,7 +3947,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 	{
 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
 	  tree bitsize =
-	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
+	    TYPE_SIZE (TREE_TYPE (gimple_get_lhs (orig_stmt_info->stmt)));
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  int nelements = vec_size_in_bits / element_bitsize;
 
@@ -3984,7 +4070,7 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
+  code_helper code = code_helper_for_stmnt (stmt_vinfo->stmt);
   tree def_for_init;
   tree init_def;
   REAL_VALUE_TYPE real_init_val = dconst0;
@@ -4002,82 +4088,106 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
   vect_reduction_type reduction_type
     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
 
-  switch (code)
-    {
-    case WIDEN_SUM_EXPR:
-    case DOT_PROD_EXPR:
-    case SAD_EXPR:
-    case PLUS_EXPR:
-    case MINUS_EXPR:
-    case BIT_IOR_EXPR:
-    case BIT_XOR_EXPR:
-    case MULT_EXPR:
-    case BIT_AND_EXPR:
+  if (code.is_tree_code ())
+    switch ((enum tree_code) code)
       {
-        /* ADJUSTMENT_DEF is NULL when called from
-           vect_create_epilog_for_reduction to vectorize double reduction.  */
-        if (adjustment_def)
-	  *adjustment_def = init_val;
-
-        if (code == MULT_EXPR)
-          {
-            real_init_val = dconst1;
-            int_init_val = 1;
-          }
-
-        if (code == BIT_AND_EXPR)
-          int_init_val = -1;
-
-        if (SCALAR_FLOAT_TYPE_P (scalar_type))
-          def_for_init = build_real (scalar_type, real_init_val);
-        else
-          def_for_init = build_int_cst (scalar_type, int_init_val);
-
-	if (adjustment_def)
-	  /* Option1: the first element is '0' or '1' as well.  */
-	  init_def = gimple_build_vector_from_val (&stmts, vectype,
-						   def_for_init);
-	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
-	  {
-	    /* Option2 (variable length): the first element is INIT_VAL.  */
+      case WIDEN_SUM_EXPR:
+      case DOT_PROD_EXPR:
+      case SAD_EXPR:
+      case PLUS_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case MULT_EXPR:
+      case BIT_AND_EXPR:
+	{
+	  /* ADJUSTMENT_DEF is NULL when called from
+	     vect_create_epilog_for_reduction to vectorize double reduction.  */
+	  if (adjustment_def)
+	    *adjustment_def = init_val;
+
+	  if (code == MULT_EXPR)
+	    {
+	      real_init_val = dconst1;
+	      int_init_val = 1;
+	    }
+
+	  if (code == BIT_AND_EXPR)
+	    int_init_val = -1;
+
+	  if (SCALAR_FLOAT_TYPE_P (scalar_type))
+	    def_for_init = build_real (scalar_type, real_init_val);
+	  else
+	    def_for_init = build_int_cst (scalar_type, int_init_val);
+
+	  if (adjustment_def)
+	    /* Option1: the first element is '0' or '1' as well.  */
 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
 						     def_for_init);
-	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
-				     vectype, init_def, init_val);
-	  }
-	else
-	  {
-	    /* Option2: the first element is INIT_VAL.  */
-	    tree_vector_builder elts (vectype, 1, 2);
-	    elts.quick_push (init_val);
-	    elts.quick_push (def_for_init);
-	    init_def = gimple_build_vector (&stmts, &elts);
-	  }
-      }
-      break;
+	  else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+	    {
+	      /* Option2 (variable length): the first element is INIT_VAL.  */
+	      init_def = gimple_build_vector_from_val (&stmts, vectype,
+						       def_for_init);
+	      init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
+				       vectype, init_def, init_val);
+	    }
+	  else
+	    {
+	      /* Option2: the first element is INIT_VAL.  */
+	      tree_vector_builder elts (vectype, 1, 2);
+	      elts.quick_push (init_val);
+	      elts.quick_push (def_for_init);
+	      init_def = gimple_build_vector (&stmts, &elts);
+	    }
+	}
+	break;
 
-    case MIN_EXPR:
-    case MAX_EXPR:
-    case COND_EXPR:
-      {
-	if (adjustment_def)
-          {
-	    *adjustment_def = NULL_TREE;
-	    if (reduction_type != COND_REDUCTION
-		&& reduction_type != EXTRACT_LAST_REDUCTION)
-	      {
-		init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
-		break;
-	      }
-	  }
-	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
-	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+      case MIN_EXPR:
+      case MAX_EXPR:
+      case COND_EXPR:
+	{
+	  if (adjustment_def)
+	    {
+	      *adjustment_def = NULL_TREE;
+	      init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
+	      break;
+	    }
+	  init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+	  init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+	}
+	break;
+
+      default:
+	gcc_unreachable ();
       }
-      break;
+  else
+    switch ((combined_fn) code)
+      {
+      CASE_CFN_FMAX:
+      CASE_CFN_FMAX_FN:
+      CASE_CFN_FMIN:
+      CASE_CFN_FMIN_FN:
+	{
+	  if (adjustment_def)
+	    {
+	      *adjustment_def = NULL_TREE;
+	      if (reduction_type != COND_REDUCTION
+		  && reduction_type != EXTRACT_LAST_REDUCTION)
+		{
+		  init_def
+		    = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
+		  break;
+		}
+	    }
+	  init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+	  init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+	}
+	break;
 
-    default:
-      gcc_unreachable ();
-    }
+      default:
+	gcc_unreachable ();
+      }
 
   if (stmts)
     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
@@ -4345,7 +4455,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
   tree vec_dest;
   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
-  enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
+  code_helper code = code_helper_for_stmnt (stmt_info->stmt);
   gimple *exit_phi;
   tree bitsize;
   tree adjustment_def = NULL;
@@ -4689,13 +4799,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
 
-  code = gimple_assign_rhs_code (orig_stmt_info->stmt);
+  code = code_helper_for_stmnt (orig_stmt_info->stmt);
   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
      partial results are added and not subtracted.  */
   if (code == MINUS_EXPR) 
     code = PLUS_EXPR;
   
-  scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
+  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.create (group_size); 
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
@@ -5988,7 +6098,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   tree vectype_in = NULL_TREE;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  enum tree_code code, orig_code;
+  code_helper code, orig_code;
   internal_fn reduc_fn;
   machine_mode vec_mode;
   int op_type;
@@ -6065,25 +6175,54 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	/* Leave the scalar phi in place.  */
 	return true;
 
-      gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
-      code = gimple_assign_rhs_code (reduc_stmt);
-      for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
+      if (gassign *reduc_stmt = dyn_cast <gassign *> (reduc_stmt_info->stmt))
 	{
-	  tree op = gimple_op (reduc_stmt, k);
-	  if (op == phi_result)
-	    continue;
-	  if (k == 1 && code == COND_EXPR)
-	    continue;
-	  bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
-	  gcc_assert (is_simple_use);
-	  if (dt == vect_constant_def || dt == vect_external_def)
-	    continue;
-	  if (!vectype_in
-	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
-	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
-	  break;
+	  code = gimple_assign_rhs_code (reduc_stmt);
+
+	  for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
+	    {
+	      tree op = gimple_op (reduc_stmt, k);
+	      if (op == phi_result)
+		continue;
+	      if (k == 1 && code == COND_EXPR)
+		continue;
+	      bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+	      gcc_assert (is_simple_use);
+	      if (dt == vect_constant_def || dt == vect_external_def)
+		continue;
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+		vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
+	      break;
+	    }
 	}
+      else if (gcall *reduc_stmt = dyn_cast <gcall *> (reduc_stmt_info->stmt))
+	{
+	  code = code_helper_for_stmnt (reduc_stmt);
+
+	  for (unsigned k = 1; k < gimple_call_num_args (reduc_stmt); ++k)
+	    {
+	      tree op = gimple_call_arg (reduc_stmt, k);
+	      if (op == phi_result)
+		continue;
+	      bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+	      gcc_assert (is_simple_use);
+	      if (dt == vect_constant_def || dt == vect_external_def)
+		continue;
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+		vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
+	      break;
+	    }
+
+	}
+      else
+	gcc_unreachable ();
+
+      gimple *reduc_stmt = reduc_stmt_info->stmt;
+
       /* For a nested cycle we might end up with an operation like
          phi_result * phi_result.  */
       if (!vectype_in)
@@ -6103,7 +6242,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	single_defuse_cycle = true;
 
       /* Create the destination vector  */
-      scalar_dest = gimple_assign_lhs (reduc_stmt);
+      scalar_dest = gimple_get_lhs (reduc_stmt);
       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
       if (slp_node)
@@ -6177,39 +6316,51 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
         inside the loop body. The last operand is the reduction variable,
         which is defined by the loop-header-phi.  */
 
-  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-
   /* Flatten RHS.  */
-  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
-    {
-    case GIMPLE_BINARY_RHS:
-      code = gimple_assign_rhs_code (stmt);
-      op_type = TREE_CODE_LENGTH (code);
-      gcc_assert (op_type == binary_op);
-      ops[0] = gimple_assign_rhs1 (stmt);
-      ops[1] = gimple_assign_rhs2 (stmt);
-      break;
-
-    case GIMPLE_TERNARY_RHS:
-      code = gimple_assign_rhs_code (stmt);
-      op_type = TREE_CODE_LENGTH (code);
-      gcc_assert (op_type == ternary_op);
-      ops[0] = gimple_assign_rhs1 (stmt);
-      ops[1] = gimple_assign_rhs2 (stmt);
-      ops[2] = gimple_assign_rhs3 (stmt);
-      break;
-
-    case GIMPLE_UNARY_RHS:
-      return false;
+  if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
+    {
+      switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
+	{
+	case GIMPLE_BINARY_RHS:
+	  code = gimple_assign_rhs_code (stmt);
+	  op_type = TREE_CODE_LENGTH ((enum tree_code) code);
+	  gcc_assert (op_type == binary_op);
+	  ops[0] = gimple_assign_rhs1 (stmt);
+	  ops[1] = gimple_assign_rhs2 (stmt);
+	  break;
 
-    default:
-      gcc_unreachable ();
+	case GIMPLE_TERNARY_RHS:
+	  code = gimple_assign_rhs_code (stmt);
+	  op_type = TREE_CODE_LENGTH ((enum tree_code) code);
+	  gcc_assert (op_type == ternary_op);
+	  ops[0] = gimple_assign_rhs1 (stmt);
+	  ops[1] = gimple_assign_rhs2 (stmt);
+	  ops[2] = gimple_assign_rhs3 (stmt);
+	  break;
+
+	case GIMPLE_UNARY_RHS:
+	  return false;
+
+	default:
+	  gcc_unreachable ();
+	}
     }
+  else if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
+    {
+      code = code_helper_for_stmnt (stmt);
+      gcc_assert (gimple_call_num_args (stmt) == 2);
+      op_type = binary_op;
+      ops[0] = gimple_call_arg (stmt, 0);
+      ops[1] = gimple_call_arg (stmt, 1);
+    }
+  else
+    gcc_unreachable ();
 
   if (code == COND_EXPR && slp_node)
     return false;
 
-  scalar_dest = gimple_assign_lhs (stmt);
+  gimple *stmt = stmt_info->stmt;
+  scalar_dest = gimple_get_lhs (stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
       && !SCALAR_FLOAT_TYPE_P (scalar_type))
@@ -6533,7 +6684,12 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
-      optab = optab_for_tree_code (code, vectype_in, optab_default);
+      if (code.is_tree_code ())
+	optab = optab_for_tree_code (code, vectype_in, optab_default);
+      else
+	/* Use MAX_EXPR tree_code for the call-based reductions.  */
+	optab = optab_for_tree_code (MAX_EXPR, vectype_in, optab_default);
+
       if (!optab)
         {
           if (dump_enabled_p ())
@@ -6897,7 +7053,15 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   else
     vec_num = 1;
 
-  internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_fn;
+  if (code.is_fn_code ())
+    {
+      gcall *orig_call = as_a <gcall *> (stmt_info->stmt);
+      internal_fn ifn = replacement_internal_fn (orig_call);
+      cond_fn = get_conditional_internal_fn (ifn);
+    }
+  else
+    cond_fn = get_conditional_internal_fn ((enum tree_code) code);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 
   if (!vec_stmt) /* transformation not required.  */
@@ -7074,7 +7238,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	      /* Make sure that the reduction accumulator is vop[0].  */
 	      if (reduc_index == 1)
 		{
-		  gcc_assert (commutative_tree_code (code));
+		  gcc_assert (code.is_fn_code ()
+			      || commutative_tree_code (code));
 		  std::swap (vop[0], vop[1]);
 		}
 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
@@ -7088,6 +7253,18 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	      new_stmt_info
 		= vect_finish_stmt_generation (stmt_info, call, gsi);
 	    }
+	  else if (code.is_fn_code ())
+	    {
+	      gcall *orig_call = as_a <gcall *> (stmt_info->stmt);
+	      internal_fn ifn = replacement_internal_fn (orig_call);
+	      gcall *call
+		= gimple_build_call_internal (ifn, 2, vop[0], vop[1]);
+	      new_temp = make_ssa_name (vec_dest, call);
+	      gimple_call_set_lhs (call, new_temp);
+	      gimple_call_set_nothrow (call, true);
+	      new_stmt_info
+		= vect_finish_stmt_generation (stmt_info, call, gsi);
+	    }
 	  else
 	    {
 	      if (op_type == ternary_op)
diff --git gcc/tree-vectorizer.h gcc/tree-vectorizer.h
index f1c186b..578105dc 100644
--- gcc/tree-vectorizer.h
+++ gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef struct _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "gimple-match.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -1556,7 +1557,7 @@ extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
-				  enum tree_code);
+				  code_helper);
 /* Drive for loop analysis stage.  */
 extern opt_loop_vec_info vect_analyze_loop (struct loop *,
 					    loop_vec_info,


* Re: [Patch, Vectorizer, SVE] fmin/fmax builtin reduction support
  2018-12-19  9:33 [Patch, Vectorizer, SVE] fmin/fmax builtin reduction support Alejandro Martinez Vicente
@ 2018-12-19 12:34 ` Richard Biener
  2018-12-19 13:41   ` Alejandro Martinez Vicente
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Biener @ 2018-12-19 12:34 UTC (permalink / raw)
  To: Alejandro.MartinezVicente; +Cc: GCC Patches, Richard Sandiford, nd

On Wed, Dec 19, 2018 at 10:33 AM Alejandro Martinez Vicente
<Alejandro.MartinezVicente@arm.com> wrote:
>
> Hi all,
>
> Loops that use the fmin/fmax builtins can be vectorized even without
> -ffast-math using SVE's FMINNM/FMAXNM instructions. This is an example:
>
> double
> f (double *x, int n)
> {
>   double res = 100.0;
>   for (int i = 0; i < n; ++i)
>     res = __builtin_fmin (res, x[i]);
>   return res;
> }
>
> Before this patch, the compiler would generate this code (-march=armv8.2-a+sve
> -O2 -ftree-vectorize):
>
> 0000000000000000 <f>:
>    0:   7100003f        cmp     w1, #0x0
>    4:   5400018d        b.le    34 <f+0x34>
>    8:   51000422        sub     w2, w1, #0x1
>    c:   91002003        add     x3, x0, #0x8
>   10:   d2e80b21        mov     x1, #0x4059000000000000
>   14:   9e670020        fmov    d0, x1
>   18:   8b224c62        add     x2, x3, w2, uxtw #3
>   1c:   d503201f        nop
>   20:   fc408401        ldr     d1, [x0],#8
>   24:   1e617800        fminnm  d0, d0, d1
>   28:   eb02001f        cmp     x0, x2
>   2c:   54ffffa1        b.ne    20 <f+0x20>
>   30:   d65f03c0        ret
>   34:   d2e80b20        mov     x0, #0x4059000000000000
>   38:   9e670000        fmov    d0, x0
>   3c:   d65f03c0        ret
>
> After this patch, this is the code that gets generated:
>
> 0000000000000000 <f>:
>    0:   7100003f        cmp     w1, #0x0
>    4:   5400020d        b.le    44 <f+0x44>
>    8:   d2800002        mov     x2, #0x0
>    c:   25d8e3e0        ptrue   p0.d
>   10:   93407c21        sxtw    x1, w1
>   14:   90000003        adrp    x3, 0 <f>
>   18:   25804001        mov     p1.b, p0.b
>   1c:   91000063        add     x3, x3, #0x0
>   20:   85c0e060        ld1rd   {z0.d}, p0/z, [x3]
>   24:   25e11fe0        whilelo p0.d, xzr, x1
>   28:   a5e24001        ld1d    {z1.d}, p0/z, [x0, x2, lsl #3]
>   2c:   04f0e3e2        incd    x2
>   30:   65c58020        fminnm  z0.d, p0/m, z0.d, z1.d
>   34:   25e11c40        whilelo p0.d, x2, x1
>   38:   54ffff81        b.ne    28 <f+0x28>  // b.any
>   3c:   65c52400        fminnmv d0, p1, z0.d
>   40:   d65f03c0        ret
>   44:   d2e80b20        mov     x0, #0x4059000000000000
>   48:   9e670000        fmov    d0, x0
>   4c:   d65f03c0        ret
>
> This patch extends the support for reductions to include calls to internal
> functions, in addition to assignment statements. For this purpose, in most
> places where a tree_code was used, a code_helper is used instead; a
> code_helper can hold either a tree_code or a combined_fn.
>
> This patch implements these tasks:
>
> - Detect a reduction candidate based on a call to an internal function
>   (currently only fmin or fmax).
> - Process the reduction using code_helper. This means that in several places
>   we have to check whether this is an assignment-based reduction or a
>   call-based reduction.
> - Add new internal functions for the fmin/fmax reductions and for conditional
>   fmin/fmax. On architectures where IEEE fmin/fmax reductions are available,
>   it is still possible to vectorize the loop using unconditional instructions.
> - Update SVE's md to support these new reductions.
> - Add new SVE tests to check that the optimal code is being generated.
>
> I tested this patch on an aarch64 machine by bootstrapping the compiler and
> running the testsuite.

Just some quick comments based on the above and the changelog.
Using code_helper is reasonable I guess.

> Alejandro
>
> gcc/ChangeLog:
>
> 2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
>         * gimple-match.h (code_helper_for_stmnt): New function to get a

code_helper_for_stmt I hope.

>         code_helper from a statement.
>         * internal-fn.def: New reduc_fmax_scal and reduc_fmin_scal optabs for
>         IEEE fp max/min reductions.

Aren't they necessarily fold_left reductions then?  Thus, should the optabs
be named accordingly fold_left_fmax_optab?

>         * optabs.def: Likewise.
>         * tree-vect-loop.c (reduction_fn_for_scalar_code): Changed function
>         signature to accept code_helper instead of tree_code. Handle the
>         fmax/fmin builtins.
>         (needs_fold_left_reduction_p): Likewise.
>         (check_reduction_path): Likewise.
>         (vect_is_simple_reduction): Use code_helper instead of tree_code. Check
>         for supported call-based reductions. Extend support for both
>         assignment-based and call-based reductions.
>         (vect_model_reduction_cost): Extend cost-model support to call-based
>         reductions (just use MAX expression).
>         (get_initial_def_for_reduction): Use code_helper instead of tree_code.
>         Extend support for both assignment-based and call-based reductions.
>         (vect_create_epilog_for_reduction): Likewise.
>         (vectorizable_reduction): Likewise.
>         * tree-vectorizer.h: Include gimple-match.h for code_helper. Use
>         code_helper in check_reduction_path signature.
>         * config/aarch64/aarch64-sve.md: Added define_expand to capture new
>         reduc_fmax_scal and reduc_fmin_scal optabs.
>         * config/aarch64/iterators.md: New FMAXMINNMV and fmaxmin_uns iterators
>         to support the new define_expand.
>
> gcc/testsuite/ChangeLog:
>
> 2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
>         * gcc.target/aarch64/sve/reduc_9.c: New test to check
>         SVE-vectorized reductions without -ffast-math.
>         * gcc.target/aarch64/sve/reduc_10.c: New test to check
>         SVE-vectorized builtin reductions without -ffast-math.


* RE: [Patch, Vectorizer, SVE] fmin/fmax builtin reduction support
  2018-12-19 12:34 ` Richard Biener
@ 2018-12-19 13:41   ` Alejandro Martinez Vicente
  0 siblings, 0 replies; 3+ messages in thread
From: Alejandro Martinez Vicente @ 2018-12-19 13:41 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Richard Sandiford, nd

[-- Attachment #1: Type: text/plain, Size: 8133 bytes --]

Richard,

I'm happy to change the name of the helper to code_helper_for_stmt; the new patch and changelog are included below. Regarding the reductions being fold_left: the FMINNM/FMINNMV instructions are defined in such a way that a fold-left ordering is not necessary (it wouldn't work with FMIN/FMINV).
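
To spell out why: FMINNM implements IEEE minNum, so a NaN operand simply
drops out in favour of the other operand, which makes the reduction freely
re-associable; no fold-left ordering is needed. A small illustration (not
part of the patch; signed-zero ordering is the usual caveat):

  #include <math.h>

  /* Fold-left (sequential) order.  */
  double seq (double a, double b, double c, double d)
  { return fmin (fmin (fmin (a, b), c), d); }

  /* Pairwise (tree) order, as a vector reduction computes it; with
     minNum semantics this agrees with the sequential order.  */
  double pair (double a, double b, double c, double d)
  { return fmin (fmin (a, b), fmin (c, d)); }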

Alejandro

 
gcc/ChangeLog:
 
2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gimple-match.h (code_helper_for_stmt): New function to get a
	code_helper from a statement.
	* internal-fn.def: New reduc_fmax_scal and reduc_fmin_scal optabs for
	IEEE fp max/min reductions.
	* optabs.def: Likewise.
	* tree-vect-loop.c (reduction_fn_for_scalar_code): Changed function
	signature to accept code_helper instead of tree_code. Handle the
	fmax/fmin builtins.
	(needs_fold_left_reduction_p): Likewise.
	(check_reduction_path): Likewise.
	(vect_is_simple_reduction): Use code_helper instead of tree_code. Check
	for supported call-based reductions. Extend support for both
	assignment-based and call-based reductions.
	(vect_model_reduction_cost): Extend cost-model support to call-based
	reductions (just use MAX expression).
	(get_initial_def_for_reduction): Use code_helper instead of tree_code.
	Extend support for both assignment-based and call-based reductions.
	(vect_create_epilog_for_reduction): Likewise.
	(vectorizable_reduction): Likewise.
	* tree-vectorizer.h: Include gimple-match.h for code_helper. Use
	code_helper in check_reduction_path signature.
	* config/aarch64/aarch64-sve.md: Added define_expand to capture new
	reduc_fmax_scal and reduc_fmin_scal optabs.
	* config/aarch64/iterators.md: New FMAXMINNMV and fmaxmin_uns iterators
	to support the new define_expand.
 
gcc/testsuite/Changelog:
 
2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gcc.target/aarch64/sve/reduc_9.c: New test to check
	SVE-vectorized reductions without -ffast-math.
	* gcc.target/aarch64/sve/reduc_10.c: New test to check
	SVE-vectorized builtin reductions without -ffast-math.
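
Since the patch leans on code_helper throughout, here is a condensed, self-contained model of the idea for anyone reading along (the toy_* names are invented for this sketch; the real class lives in gimple-match.h):

#include <cstdio>

/* Stand-ins for GCC's tree_code and combined_fn enums.  */
enum toy_tree_code { TOY_PLUS_EXPR = 1, TOY_MAX_EXPR };
enum toy_combined_fn { TOY_CFN_FMIN = 1, TOY_CFN_FMAX };

/* One integer encodes either kind of code; the toy stores call codes
   negated, so is_tree_code/is_fn_code amount to a sign test.  */
class toy_code_helper
{
  int rep;
public:
  toy_code_helper (toy_tree_code c) : rep ((int) c) {}
  toy_code_helper (toy_combined_fn f) : rep (-(int) f) {}
  bool is_tree_code () const { return rep > 0; }
  bool is_fn_code () const { return rep < 0; }
  operator toy_tree_code () const { return (toy_tree_code) rep; }
  operator toy_combined_fn () const { return (toy_combined_fn) -rep; }
};

int main (void)
{
  toy_code_helper assign_code (TOY_MAX_EXPR);  /* from a gassign  */
  toy_code_helper call_code (TOY_CFN_FMIN);    /* from a gcall  */
  printf ("%d %d\n", assign_code.is_tree_code (), call_code.is_fn_code ());
  return 0;
}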

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: 19 December 2018 12:35
To: Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Richard Sandiford <Richard.Sandiford@arm.com>; nd <nd@arm.com>
Subject: Re: [Patch, Vectorizer, SVE] fmin/fmax builtin reduction support

On Wed, Dec 19, 2018 at 10:33 AM Alejandro Martinez Vicente <Alejandro.MartinezVicente@arm.com> wrote:
>
> Hi all,
>
> Loops that use the fmin/fmax builtins can be vectorized even without 
> -ffast-math using SVE's FMINNM/FMAXNM instructions. This is an example:
>
> double
> f (double *x, int n)
> {
>   double res = 100.0;
>   for (int i = 0; i < n; ++i)
>     res = __builtin_fmin (res, x[i]);
>   return res;
> }
>
> Before this patch, the compiler would generate this code 
> (-march=armv8.2-a+sve
> -O2 -ftree-vectorize):
>
> 0000000000000000 <f>:
>    0:   7100003f        cmp     w1, #0x0
>    4:   5400018d        b.le    34 <f+0x34>
>    8:   51000422        sub     w2, w1, #0x1
>    c:   91002003        add     x3, x0, #0x8
>   10:   d2e80b21        mov     x1, #0x4059000000000000
>   14:   9e670020        fmov    d0, x1
>   18:   8b224c62        add     x2, x3, w2, uxtw #3
>   1c:   d503201f        nop
>   20:   fc408401        ldr     d1, [x0],#8
>   24:   1e617800        fminnm  d0, d0, d1
>   28:   eb02001f        cmp     x0, x2
>   2c:   54ffffa1        b.ne    20 <f+0x20>
>   30:   d65f03c0        ret
>   34:   d2e80b20        mov     x0, #0x4059000000000000
>   38:   9e670000        fmov    d0, x0
>   3c:   d65f03c0        ret
>
> After this patch, this is the code that gets generated:
>
> 0000000000000000 <f>:
>    0:   7100003f        cmp     w1, #0x0
>    4:   5400020d        b.le    44 <f+0x44>
>    8:   d2800002        mov     x2, #0x0
>    c:   25d8e3e0        ptrue   p0.d
>   10:   93407c21        sxtw    x1, w1
>   14:   90000003        adrp    x3, 0 <f>
>   18:   25804001        mov     p1.b, p0.b
>   1c:   91000063        add     x3, x3, #0x0
>   20:   85c0e060        ld1rd   {z0.d}, p0/z, [x3]
>   24:   25e11fe0        whilelo p0.d, xzr, x1
>   28:   a5e24001        ld1d    {z1.d}, p0/z, [x0, x2, lsl #3]
>   2c:   04f0e3e2        incd    x2
>   30:   65c58020        fminnm  z0.d, p0/m, z0.d, z1.d
>   34:   25e11c40        whilelo p0.d, x2, x1
>   38:   54ffff81        b.ne    28 <f+0x28>  // b.any
>   3c:   65c52400        fminnmv d0, p1, z0.d
>   40:   d65f03c0        ret
>   44:   d2e80b20        mov     x0, #0x4059000000000000
>   48:   9e670000        fmov    d0, x0
>   4c:   d65f03c0        ret
>
> This patch extends the support for reductions to include calls to 
> internal functions, in addition to assign statements. For this 
> purpose, in most places where a tree_code would be used, a code_helper 
> is used instead. The code_helper can hold either a tree_code or a combined_fn.
>
> This patch implements these tasks:
>
> - Detect a reduction candidate based on a call to an internal function
>   (currently only fmin or fmax).
> - Process the reduction using code_helper. This means that at several places
>   we have to check whether this is an assignment-based reduction or a call-based
>   reduction.
> - Add new internal functions for the fmin/fmax reductions and for conditional
>   fmin/fmax. On architectures where IEEE fmin/fmax reductions are available, it
>   is still possible to vectorize the loop using unconditional instructions.
> - Update SVE's md to support these new reductions.
> - Add new SVE tests to check that the optimal code is being generated.
>
> I tested this patch on an aarch64 machine, bootstrapping the compiler
> and running the checks.

Just some quick comments based on the above and the changelog.
Using code_helper is reasonable I guess.

> Alejandro
>
> gcc/Changelog:
>
> 2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
>         * gimple-match.h (code_helper_for_stmnt): New function to get 
> a

code_helper_for_stmt I hope.

>         code_helper from a statement.
>         * internal-fn.def: New reduc_fmax_scal and reduc_fmin_scal optabs for
>         IEEE fp max/min reductions.

Aren't they necessarily fold_left reductions then?  Thus, should the optabs be named accordingly, e.g. fold_left_fmax_optab?

>         * optabs.def: Likewise.
>         * tree-vect-loop.c (reduction_fn_for_scalar_code): Changed function
>         signature to accept code_helper instead of tree_code. Handle the
>         fmax/fmin builtins.
>         (needs_fold_left_reduction_p): Likewise.
>         (check_reduction_path): Likewise.
>         (vect_is_simple_reduction): Use code_helper instead of tree_code. Check
>         for supported call-based reductions. Extend support for both
>         assignment-based and call-based reductions.
>         (vect_model_reduction_cost): Extend cost-model support to call-based
>         reductions (just use MAX expression).
>         (get_initial_def_for_reduction): Use code_helper instead of tree_code.
>         Extend support for both assignment-based and call-based reductions.
>         (vect_create_epilog_for_reduction): Likewise.
>         (vectorizable_reduction): Likewise.
>         * tree-vectorizer.h: Include gimple-match.h for code_helper. Use
>         code_helper in check_reduction_path signature.
>         * config/aarch64/aarch64-sve.md: Added define_expand to capture new
>         reduc_fmax_scal and reduc_fmin_scal optabs.
>         * config/aarch64/iterators.md: New FMAXMINNMV and fmaxmin_uns iterators
>         to support the new define_expand.
>
> gcc/testsuite/Changelog:
>
> 2018-12-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
>
>         * gcc.target/aarch64/sve/reduc_9.c: New test to check
>         SVE-vectorized reductions without -ffast-math.
>         * gcc.target/aarch64/sve/reduc_10.c: New test to check
>         SVE-vectorized builtin reductions without -ffast-math.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: final.patch --]
[-- Type: text/x-patch; name="final.patch", Size: 43971 bytes --]

diff --git gcc/config/aarch64/aarch64-sve.md gcc/config/aarch64/aarch64-sve.md
index 5cd591b..d9fbc79 100644
--- gcc/config/aarch64/aarch64-sve.md
+++ gcc/config/aarch64/aarch64-sve.md
@@ -2109,6 +2109,18 @@
   }
 )
 
+;; Unpredicated ieee floating-point MIN/MAX reduction.
+(define_expand "reduc_<fmaxmin_uns>_scal_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+	(unspec:<VEL> [(match_dup 2)
+		       (match_operand:SVE_F 1 "register_operand")]
+		      FMAXMINNMV))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
 ;; Predicated floating-point MIN/MAX reduction.
 (define_insn "*reduc_<maxmin_uns>_scal_<mode>"
   [(set (match_operand:<VEL> 0 "register_operand" "=w")
diff --git gcc/config/aarch64/iterators.md gcc/config/aarch64/iterators.md
index 524e4e6..ccc9f9d 100644
--- gcc/config/aarch64/iterators.md
+++ gcc/config/aarch64/iterators.md
@@ -474,6 +474,8 @@
     UNSPEC_COND_DIV	; Used in aarch64-sve.md.
     UNSPEC_COND_MAX	; Used in aarch64-sve.md.
     UNSPEC_COND_MIN	; Used in aarch64-sve.md.
+    UNSPEC_COND_FMAX	; Used in aarch64-sve.md.
+    UNSPEC_COND_FMIN	; Used in aarch64-sve.md.
     UNSPEC_COND_FMLA	; Used in aarch64-sve.md.
     UNSPEC_COND_FMLS	; Used in aarch64-sve.md.
     UNSPEC_COND_FNMLA	; Used in aarch64-sve.md.
@@ -1458,6 +1460,8 @@
 (define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV
 			       UNSPEC_FMAXNMV UNSPEC_FMINNMV])
 
+(define_int_iterator FMAXMINNMV [UNSPEC_FMAXNMV UNSPEC_FMINNMV])
+
 (define_int_iterator BITWISEV [UNSPEC_ANDV UNSPEC_IORV UNSPEC_XORV])
 
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
@@ -1569,7 +1573,8 @@
 
 (define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_ADD UNSPEC_COND_SUB
 					 UNSPEC_COND_MUL UNSPEC_COND_DIV
-					 UNSPEC_COND_MAX UNSPEC_COND_MIN])
+					 UNSPEC_COND_MAX UNSPEC_COND_MIN
+					 UNSPEC_COND_FMAX UNSPEC_COND_FMIN])
 
 (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA
 					  UNSPEC_COND_FMLS
@@ -1616,7 +1621,9 @@
 			(UNSPEC_COND_FMLA "fma")
 			(UNSPEC_COND_FMLS "fnma")
 			(UNSPEC_COND_FNMLA "fnms")
-			(UNSPEC_COND_FNMLS "fms")])
+			(UNSPEC_COND_FNMLS "fms")
+			(UNSPEC_COND_FMAX "fmax")
+			(UNSPEC_COND_FMIN "fmin")])
 
 (define_int_attr  maxmin_uns [(UNSPEC_UMAXV "umax")
 			      (UNSPEC_UMINV "umin")
@@ -1631,6 +1638,10 @@
 			      (UNSPEC_FMAXNM "fmax")
 			      (UNSPEC_FMINNM "fmin")])
 
+
+(define_int_attr  fmaxmin_uns [(UNSPEC_FMAXNMV "fmax")
+			       (UNSPEC_FMINNMV "fmin")])
+
 (define_int_attr  maxmin_uns_op [(UNSPEC_UMAXV "umax")
 				 (UNSPEC_UMINV "umin")
 				 (UNSPEC_SMAXV "smax")
@@ -1832,14 +1843,18 @@
 			    (UNSPEC_COND_MUL "fmul")
 			    (UNSPEC_COND_DIV "fdiv")
 			    (UNSPEC_COND_MAX "fmaxnm")
-			    (UNSPEC_COND_MIN "fminnm")])
+			    (UNSPEC_COND_MIN "fminnm")
+			    (UNSPEC_COND_FMAX "fmaxnm")
+			    (UNSPEC_COND_FMIN "fminnm")])
 
 (define_int_attr sve_fp_op_rev [(UNSPEC_COND_ADD "fadd")
 			        (UNSPEC_COND_SUB "fsubr")
 			        (UNSPEC_COND_MUL "fmul")
 			        (UNSPEC_COND_DIV "fdivr")
 			        (UNSPEC_COND_MAX "fmaxnm")
-			        (UNSPEC_COND_MIN "fminnm")])
+				(UNSPEC_COND_MIN "fminnm")
+				(UNSPEC_COND_FMAX "fmaxnm")
+				(UNSPEC_COND_FMIN "fminnm")])
 
 (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
 			      (UNSPEC_COND_FMLS "fmls")
diff --git gcc/gimple-match.h gcc/gimple-match.h
index b6eb888..fd657ac 100644
--- gcc/gimple-match.h
+++ gcc/gimple-match.h
@@ -327,6 +327,21 @@ gimple_simplified_result_is_gimple_val (const gimple_match_op *op)
 	  && is_gimple_val (op->ops[0]));
 }
 
+/* Return code_helper for a gassign or gcall.  */
+
+inline code_helper
+code_helper_for_stmt (gimple *orig_stmt)
+{
+  code_helper code;
+  if (gassign *stmt = dyn_cast <gassign *> (orig_stmt))
+    code = code_helper (gimple_assign_rhs_code (stmt));
+  else if (gcall *stmt = dyn_cast <gcall *> (orig_stmt))
+    code = code_helper (gimple_call_combined_fn (stmt));
+  else
+    gcc_unreachable ();
+  return code;
+}
+
 extern tree (*mprts_hook) (gimple_match_op *);
 
 bool gimple_simplify (gimple *, gimple_match_op *, gimple_seq *,
diff --git gcc/internal-fn.c gcc/internal-fn.c
index d082dd5..629b689 100644
--- gcc/internal-fn.c
+++ gcc/internal-fn.c
@@ -3336,7 +3336,9 @@ conditional_internal_fn_code (internal_fn ifn)
   T (FMA) \
   T (FMS) \
   T (FNMA) \
-  T (FNMS)
+  T (FNMS) \
+  T (FMIN) \
+  T (FMAX)
 
 /* Return a function that only performs internal function FN when a
    certain condition is met and that uses a given fallback value otherwise.
diff --git gcc/internal-fn.def gcc/internal-fn.def
index cda314e..8ea43bf 100644
--- gcc/internal-fn.def
+++ gcc/internal-fn.def
@@ -173,6 +173,9 @@ DEF_INTERNAL_OPTAB_FN (COND_FMS, ECF_CONST, cond_fms, cond_ternary)
 DEF_INTERNAL_OPTAB_FN (COND_FNMA, ECF_CONST, cond_fnma, cond_ternary)
 DEF_INTERNAL_OPTAB_FN (COND_FNMS, ECF_CONST, cond_fnms, cond_ternary)
 
+DEF_INTERNAL_OPTAB_FN (COND_FMAX, ECF_CONST, cond_fmax, cond_binary)
+DEF_INTERNAL_OPTAB_FN (COND_FMIN, ECF_CONST, cond_fmin, cond_binary)
+
 DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
 
 DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
@@ -187,6 +190,10 @@ DEF_INTERNAL_OPTAB_FN (REDUC_IOR, ECF_CONST | ECF_NOTHROW,
 		       reduc_ior_scal, unary)
 DEF_INTERNAL_OPTAB_FN (REDUC_XOR, ECF_CONST | ECF_NOTHROW,
 		       reduc_xor_scal, unary)
+DEF_INTERNAL_OPTAB_FN (REDUC_FMAX, ECF_CONST | ECF_NOTHROW,
+		       reduc_fmax_scal, unary)
+DEF_INTERNAL_OPTAB_FN (REDUC_FMIN, ECF_CONST | ECF_NOTHROW,
+		       reduc_fmin_scal, unary)
 
 /* Extract the last active element from a vector.  */
 DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
diff --git gcc/optabs.def gcc/optabs.def
index 5a67f5e..8ed4de2 100644
--- gcc/optabs.def
+++ gcc/optabs.def
@@ -238,6 +238,8 @@ OPTAB_D (cond_fma_optab, "cond_fma$a")
 OPTAB_D (cond_fms_optab, "cond_fms$a")
 OPTAB_D (cond_fnma_optab, "cond_fnma$a")
 OPTAB_D (cond_fnms_optab, "cond_fnms$a")
+OPTAB_D (cond_fmin_optab, "cond_fmin$a")
+OPTAB_D (cond_fmax_optab, "cond_fmax$a")
 OPTAB_D (cmov_optab, "cmov$a6")
 OPTAB_D (cstore_optab, "cstore$a4")
 OPTAB_D (ctrap_optab, "ctrap$a4")
@@ -315,6 +317,8 @@ OPTAB_D (reduc_umin_scal_optab, "reduc_umin_scal_$a")
 OPTAB_D (reduc_and_scal_optab,  "reduc_and_scal_$a")
 OPTAB_D (reduc_ior_scal_optab,  "reduc_ior_scal_$a")
 OPTAB_D (reduc_xor_scal_optab,  "reduc_xor_scal_$a")
+OPTAB_D (reduc_fmax_scal_optab, "reduc_fmax_scal_$a")
+OPTAB_D (reduc_fmin_scal_optab, "reduc_fmin_scal_$a")
 OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
 
 OPTAB_D (extract_last_optab, "extract_last_$a")
diff --git gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
new file mode 100644
index 0000000..d5ebe97
--- /dev/null
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_REDUC_BI_MAXMIN(TYPE, NAME, FUNC)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_bi_##NAME##_##TYPE (TYPE *a, int n)	\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r = __builtin_##FUNC (r, a[i]);		\
+  return r;					\
+}
+
+#define TEST_BI_MAXMIN(T)			\
+  T (_Float16, max, fmaxf16)			\
+  T (float, max, fmaxf)				\
+  T (double, max, fmax)				\
+						\
+  T (_Float16, min, fminf16)			\
+  T (float, min, fminf)				\
+  T (double, min, fmin)
+
+TEST_BI_MAXMIN (DEF_REDUC_BI_MAXMIN)
+
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmaxnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
diff --git gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
new file mode 100644
index 0000000..9147565
--- /dev/null
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c
@@ -0,0 +1,201 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_REDUC_PLUS(TYPE)			\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_plus_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 0;					\
+  for (int i = 0; i < n; ++i)			\
+    r += a[i];					\
+  return r;					\
+}
+
+#define TEST_PLUS(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)					\
+  T (uint8_t)					\
+  T (uint16_t)					\
+  T (uint32_t)					\
+  T (uint64_t)					\
+  T (_Float16)					\
+  T (float)					\
+  T (double)
+
+TEST_PLUS (DEF_REDUC_PLUS)
+
+#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_##NAME##_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r = a[i] CMP_OP r ? a[i] : r;		\
+  return r;					\
+}
+
+#define TEST_MAXMIN(T)				\
+  T (int8_t, max, >)				\
+  T (int16_t, max, >)				\
+  T (int32_t, max, >)				\
+  T (int64_t, max, >)				\
+  T (uint8_t, max, >)				\
+  T (uint16_t, max, >)				\
+  T (uint32_t, max, >)				\
+  T (uint64_t, max, >)				\
+  T (_Float16, max, >)				\
+  T (float, max, >)				\
+  T (double, max, >)				\
+						\
+  T (int8_t, min, <)				\
+  T (int16_t, min, <)				\
+  T (int32_t, min, <)				\
+  T (int64_t, min, <)				\
+  T (uint8_t, min, <)				\
+  T (uint16_t, min, <)				\
+  T (uint32_t, min, <)				\
+  T (uint64_t, min, <)				\
+  T (_Float16, min, <)				\
+  T (float, min, <)				\
+  T (double, min, <)
+
+TEST_MAXMIN (DEF_REDUC_MAXMIN)
+
+#define DEF_REDUC_BITWISE(TYPE, NAME, BIT_OP)	\
+TYPE __attribute__ ((noinline, noclone))	\
+reduc_##NAME##_##TYPE (TYPE *a, int n)		\
+{						\
+  TYPE r = 13;					\
+  for (int i = 0; i < n; ++i)			\
+    r BIT_OP a[i];				\
+  return r;					\
+}
+
+#define TEST_BITWISE(T)				\
+  T (int8_t, and, &=)				\
+  T (int16_t, and, &=)				\
+  T (int32_t, and, &=)				\
+  T (int64_t, and, &=)				\
+  T (uint8_t, and, &=)				\
+  T (uint16_t, and, &=)				\
+  T (uint32_t, and, &=)				\
+  T (uint64_t, and, &=)				\
+						\
+  T (int8_t, ior, |=)				\
+  T (int16_t, ior, |=)				\
+  T (int32_t, ior, |=)				\
+  T (int64_t, ior, |=)				\
+  T (uint8_t, ior, |=)				\
+  T (uint16_t, ior, |=)				\
+  T (uint32_t, ior, |=)				\
+  T (uint64_t, ior, |=)				\
+						\
+  T (int8_t, xor, ^=)				\
+  T (int16_t, xor, ^=)				\
+  T (int32_t, xor, ^=)				\
+  T (int64_t, xor, ^=)				\
+  T (uint8_t, xor, ^=)				\
+  T (uint16_t, xor, ^=)				\
+  T (uint32_t, xor, ^=)				\
+  T (uint64_t, xor, ^=)
+
+TEST_BITWISE (DEF_REDUC_BITWISE)
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsmaxv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmaxv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tumaxv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tsminv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsminv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuminv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 0 } } */
+/* { dg-final { scan-assembler-times {\tfminnmv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 0 } } */
+
+/* { dg-final { scan-assembler-times {\tandv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tandv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\torv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\torv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\teorv\tb[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\teorv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
diff --git gcc/tree-vect-loop.c gcc/tree-vect-loop.c
index 633c315..20f2f06 100644
--- gcc/tree-vect-loop.c
+++ gcc/tree-vect-loop.c
@@ -54,6 +54,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
 #include "tree-eh.h"
+#include "case-cfn-macros.h"
+#include "gimple-match.h"
+#include "builtins.h"
 
 /* Loop Vectorization Pass.
 
@@ -2322,7 +2325,7 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
 /* Function reduction_fn_for_scalar_code
 
    Input:
-   CODE - tree_code of a reduction operations.
+   CODE - code_helper of a reduction operation.
 
    Output:
    REDUC_FN - the corresponding internal function to be used to reduce the
@@ -2333,21 +2336,22 @@ fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
 static bool
-reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
+reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
 {
-  switch (code)
-    {
+  if (code.is_tree_code ())
+    switch ((enum tree_code) code)
+      {
       case MAX_EXPR:
-        *reduc_fn = IFN_REDUC_MAX;
-        return true;
+	*reduc_fn = IFN_REDUC_MAX;
+	return true;
 
       case MIN_EXPR:
-        *reduc_fn = IFN_REDUC_MIN;
-        return true;
+	*reduc_fn = IFN_REDUC_MIN;
+	return true;
 
       case PLUS_EXPR:
-        *reduc_fn = IFN_REDUC_PLUS;
-        return true;
+	*reduc_fn = IFN_REDUC_PLUS;
+	return true;
 
       case BIT_AND_EXPR:
 	*reduc_fn = IFN_REDUC_AND;
@@ -2363,12 +2367,28 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
 
       case MULT_EXPR:
       case MINUS_EXPR:
-        *reduc_fn = IFN_LAST;
-        return true;
+	*reduc_fn = IFN_LAST;
+	return true;
 
       default:
-       return false;
-    }
+	return false;
+      }
+  else
+    switch ((combined_fn) code)
+      {
+      CASE_CFN_FMAX:
+      CASE_CFN_FMAX_FN:
+	*reduc_fn = IFN_REDUC_FMAX;
+	return true;
+
+      CASE_CFN_FMIN:
+      CASE_CFN_FMIN_FN:
+	*reduc_fn = IFN_REDUC_FMIN;
+	return true;
+
+      default:
+	return false;
+      }
 }
 
 /* If there is a neutral value X such that SLP reduction NODE would not
@@ -2616,9 +2636,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
    overflow must wrap.  */
 
 static bool
-needs_fold_left_reduction_p (tree type, tree_code code,
+needs_fold_left_reduction_p (tree type, code_helper orig_code,
 			     bool need_wrapping_integral_overflow)
 {
+  if (orig_code.is_fn_code ())
+    return false;
+  enum tree_code code = orig_code;
+
   /* CHECKME: check for !flag_finite_math_only too?  */
   if (SCALAR_FLOAT_TYPE_P (type))
     switch (code)
@@ -2653,7 +2677,7 @@ needs_fold_left_reduction_p (tree type, tree_code code,
 
 bool
 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
-		      tree loop_arg, enum tree_code code)
+		      tree loop_arg, code_helper code)
 {
   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
   auto_bitmap visited;
@@ -2752,7 +2776,6 @@ pop:
   return ! fail && ! neg;
 }
 
-
 /* Function vect_is_simple_reduction
 
    (1) Detect a cross-iteration def-use cycle that represents a simple
@@ -2808,13 +2831,13 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   gimple *phi_use_stmt = NULL;
-  enum tree_code orig_code, code;
+  code_helper orig_code, code;
   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
   tree type;
   tree name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
-  bool phi_def;
+  bool phi_def, is_call;
 
   *double_reduc = false;
   *v_reduc_type = TREE_CODE_REDUCTION;
@@ -2865,11 +2888,19 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     {
       name = gimple_assign_lhs (def_stmt);
       phi_def = false;
+      is_call = false;
     }
   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
     {
       name = PHI_RESULT (def_stmt);
       phi_def = true;
+      is_call = false;
+    }
+  else if (gcall *def_stmt = dyn_cast <gcall *> (def_stmt_info->stmt))
+    {
+      name = gimple_call_lhs (def_stmt);
+      phi_def = false;
+      is_call = true;
     }
   else
     {
@@ -2970,8 +3001,43 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 	  }
     }
 
-  gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  code = orig_code = gimple_assign_rhs_code (def_stmt);
+  if (is_call)
+    {
+      gcall *def_stmt = as_a <gcall *> (def_stmt_info->stmt);
+
+      if (!gimple_call_builtin_p (def_stmt))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "reduction: unhandled reduction "
+			     "with non-builtin call: %G",
+			     def_stmt_info->stmt);
+	  return NULL;
+	}
+
+      code = orig_code = gimple_call_combined_fn (def_stmt);
+
+      switch ((combined_fn) orig_code)
+	{
+	CASE_CFN_FMAX:
+	CASE_CFN_FMAX_FN:
+	CASE_CFN_FMIN:
+	CASE_CFN_FMIN_FN:
+	  break;
+	default:
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "reduction: unhandled reduction with builtin: %G",
+			     def_stmt_info->stmt);
+	  return NULL;
+	}
+    }
+  else
+    {
+      gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
+      code = orig_code = gimple_assign_rhs_code (def_stmt);
+    }
+  gimple *def_stmt = def_stmt_info->stmt;
 
   if (nested_in_vect_loop && !check_reduction)
     {
@@ -3026,17 +3092,28 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       op1 = gimple_assign_rhs2 (def_stmt);
       op2 = gimple_assign_rhs3 (def_stmt);
     }
-  else if (!commutative_tree_code (code) || !associative_tree_code (code))
+  else if (!is_call
+	   && (!commutative_tree_code (code)
+	       || !associative_tree_code (code)))
     {
       if (dump_enabled_p ())
 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
 			"reduction: not commutative/associative: ");
       return NULL;
     }
-  else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
+  else if (is_call || get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
     {
-      op1 = gimple_assign_rhs1 (def_stmt);
-      op2 = gimple_assign_rhs2 (def_stmt);
+      if (is_call)
+	{
+	  gcc_assert (gimple_call_num_args (def_stmt) == 2);
+	  op1 = gimple_call_arg (def_stmt, 0);
+	  op2 = gimple_call_arg (def_stmt, 1);
+	}
+      else
+	{
+	  op1 = gimple_assign_rhs1 (def_stmt);
+	  op2 = gimple_assign_rhs2 (def_stmt);
+	}
     }
   else
     {
@@ -3055,7 +3132,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       return NULL;
     }
 
-  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
+  type = TREE_TYPE (gimple_get_lhs (def_stmt));
   if ((TREE_CODE (op1) == SSA_NAME
        && !types_compatible_p (type,TREE_TYPE (op1)))
       || (TREE_CODE (op2) == SSA_NAME
@@ -3164,6 +3241,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 		  return NULL;
 		}
 	    }
+	  else if (is_call)
+	    swap_ssa_operands (def_stmt, gimple_call_arg_ptr (def_stmt, 0),
+			       gimple_call_arg_ptr (def_stmt, 1));
 	  else
 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
 			       gimple_assign_rhs2_ptr (def_stmt));
@@ -3172,7 +3252,10 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
 	    report_vect_op (MSG_NOTE, def_stmt,
 			    "detected reduction: need to swap operands: ");
 
-	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
+	  if (!is_call && CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
+	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
+	  else if (is_call
+		   && CONSTANT_CLASS_P (gimple_call_arg (def_stmt, 0)))
 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
         }
       else
@@ -3745,6 +3828,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 			   int ncopies, stmt_vector_for_cost *cost_vec)
 {
   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
+  code_helper orig_code;
   enum tree_code code;
   optab optab;
   tree vectype;
@@ -3765,7 +3849,9 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
   mode = TYPE_MODE (vectype);
   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
 
-  code = gimple_assign_rhs_code (orig_stmt_info->stmt);
+  orig_code = code_helper_for_stmt (orig_stmt_info->stmt);
+  /* Use MAX_EXPR tree_code for the call-based reductions.  */
+  code = orig_code.is_tree_code () ? (enum tree_code) orig_code : MAX_EXPR;
 
   if (reduction_type == EXTRACT_LAST_REDUCTION
       || reduction_type == FOLD_LEFT_REDUCTION)
@@ -3861,7 +3947,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 	{
 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
 	  tree bitsize =
-	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
+	    TYPE_SIZE (TREE_TYPE (gimple_get_lhs (orig_stmt_info->stmt)));
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  int nelements = vec_size_in_bits / element_bitsize;
 
@@ -3984,7 +4070,7 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
+  code_helper code = code_helper_for_stmt (stmt_vinfo->stmt);
   tree def_for_init;
   tree init_def;
   REAL_VALUE_TYPE real_init_val = dconst0;
@@ -4002,82 +4088,106 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
   vect_reduction_type reduction_type
     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
 
-  switch (code)
-    {
-    case WIDEN_SUM_EXPR:
-    case DOT_PROD_EXPR:
-    case SAD_EXPR:
-    case PLUS_EXPR:
-    case MINUS_EXPR:
-    case BIT_IOR_EXPR:
-    case BIT_XOR_EXPR:
-    case MULT_EXPR:
-    case BIT_AND_EXPR:
+  if (code.is_tree_code ())
+    switch ((enum tree_code) code)
       {
-        /* ADJUSTMENT_DEF is NULL when called from
-           vect_create_epilog_for_reduction to vectorize double reduction.  */
-        if (adjustment_def)
-	  *adjustment_def = init_val;
-
-        if (code == MULT_EXPR)
-          {
-            real_init_val = dconst1;
-            int_init_val = 1;
-          }
-
-        if (code == BIT_AND_EXPR)
-          int_init_val = -1;
-
-        if (SCALAR_FLOAT_TYPE_P (scalar_type))
-          def_for_init = build_real (scalar_type, real_init_val);
-        else
-          def_for_init = build_int_cst (scalar_type, int_init_val);
-
-	if (adjustment_def)
-	  /* Option1: the first element is '0' or '1' as well.  */
-	  init_def = gimple_build_vector_from_val (&stmts, vectype,
-						   def_for_init);
-	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
-	  {
-	    /* Option2 (variable length): the first element is INIT_VAL.  */
+      case WIDEN_SUM_EXPR:
+      case DOT_PROD_EXPR:
+      case SAD_EXPR:
+      case PLUS_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case MULT_EXPR:
+      case BIT_AND_EXPR:
+	{
+	  /* ADJUSTMENT_DEF is NULL when called from
+	     vect_create_epilog_for_reduction to vectorize double reduction.  */
+	  if (adjustment_def)
+	    *adjustment_def = init_val;
+
+	  if (code == MULT_EXPR)
+	    {
+	      real_init_val = dconst1;
+	      int_init_val = 1;
+	    }
+
+	  if (code == BIT_AND_EXPR)
+	    int_init_val = -1;
+
+	  if (SCALAR_FLOAT_TYPE_P (scalar_type))
+	    def_for_init = build_real (scalar_type, real_init_val);
+	  else
+	    def_for_init = build_int_cst (scalar_type, int_init_val);
+
+	  if (adjustment_def)
+	    /* Option1: the first element is '0' or '1' as well.  */
 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
 						     def_for_init);
-	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
-				     vectype, init_def, init_val);
-	  }
-	else
-	  {
-	    /* Option2: the first element is INIT_VAL.  */
-	    tree_vector_builder elts (vectype, 1, 2);
-	    elts.quick_push (init_val);
-	    elts.quick_push (def_for_init);
-	    init_def = gimple_build_vector (&stmts, &elts);
-	  }
-      }
-      break;
+	  else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+	    {
+	      /* Option2 (variable length): the first element is INIT_VAL.  */
+	      init_def = gimple_build_vector_from_val (&stmts, vectype,
+						       def_for_init);
+	      init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
+				       vectype, init_def, init_val);
+	    }
+	  else
+	    {
+	      /* Option2: the first element is INIT_VAL.  */
+	      tree_vector_builder elts (vectype, 1, 2);
+	      elts.quick_push (init_val);
+	      elts.quick_push (def_for_init);
+	      init_def = gimple_build_vector (&stmts, &elts);
+	    }
+	}
+	break;
 
-    case MIN_EXPR:
-    case MAX_EXPR:
-    case COND_EXPR:
-      {
-	if (adjustment_def)
-          {
-	    *adjustment_def = NULL_TREE;
-	    if (reduction_type != COND_REDUCTION
-		&& reduction_type != EXTRACT_LAST_REDUCTION)
-	      {
-		init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
-		break;
-	      }
-	  }
-	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
-	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+      case MIN_EXPR:
+      case MAX_EXPR:
+      case COND_EXPR:
+	{
+	  if (adjustment_def)
+	    {
+	      *adjustment_def = NULL_TREE;
+	      init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
+	      break;
+	    }
+	  init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+	  init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+	}
+	break;
+
+      default:
+	gcc_unreachable ();
       }
-      break;
+  else
+    switch ((combined_fn) code)
+      {
+      CASE_CFN_FMAX:
+      CASE_CFN_FMAX_FN:
+      CASE_CFN_FMIN:
+      CASE_CFN_FMIN_FN:
+	{
+	  if (adjustment_def)
+	    {
+	      *adjustment_def = NULL_TREE;
+	      if (reduction_type != COND_REDUCTION
+		  && reduction_type != EXTRACT_LAST_REDUCTION)
+		{
+		  init_def
+		    = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
+		  break;
+		}
+	    }
+	  init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
+	  init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
+	}
+	break;
 
-    default:
-      gcc_unreachable ();
-    }
+      default:
+	gcc_unreachable ();
+      }
 
   if (stmts)
     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
@@ -4345,7 +4455,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
   tree vec_dest;
   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
-  enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
+  code_helper code = code_helper_for_stmt (stmt_info->stmt);
   gimple *exit_phi;
   tree bitsize;
   tree adjustment_def = NULL;
@@ -4689,13 +4799,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
 
-  code = gimple_assign_rhs_code (orig_stmt_info->stmt);
+  code = code_helper_for_stmt (orig_stmt_info->stmt);
   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
      partial results are added and not subtracted.  */
   if (code == MINUS_EXPR) 
     code = PLUS_EXPR;
   
-  scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
+  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.create (group_size); 
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
@@ -5988,7 +6098,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   tree vectype_in = NULL_TREE;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  enum tree_code code, orig_code;
+  code_helper code, orig_code;
   internal_fn reduc_fn;
   machine_mode vec_mode;
   int op_type;
@@ -6065,25 +6175,54 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	/* Leave the scalar phi in place.  */
 	return true;
 
-      gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
-      code = gimple_assign_rhs_code (reduc_stmt);
-      for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
+      if (gassign *reduc_stmt = dyn_cast <gassign *> (reduc_stmt_info->stmt))
 	{
-	  tree op = gimple_op (reduc_stmt, k);
-	  if (op == phi_result)
-	    continue;
-	  if (k == 1 && code == COND_EXPR)
-	    continue;
-	  bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
-	  gcc_assert (is_simple_use);
-	  if (dt == vect_constant_def || dt == vect_external_def)
-	    continue;
-	  if (!vectype_in
-	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
-	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
-	  break;
+	  code = gimple_assign_rhs_code (reduc_stmt);
+
+	  for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
+	    {
+	      tree op = gimple_op (reduc_stmt, k);
+	      if (op == phi_result)
+		continue;
+	      if (k == 1 && code == COND_EXPR)
+		continue;
+	      bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+	      gcc_assert (is_simple_use);
+	      if (dt == vect_constant_def || dt == vect_external_def)
+		continue;
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+		vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
+	      break;
+	    }
 	}
+      else if (gcall *reduc_stmt = dyn_cast <gcall *> (reduc_stmt_info->stmt))
+	{
+	  code = code_helper_for_stmt (reduc_stmt);
+
+	  for (unsigned k = 1; k < gimple_call_num_args (reduc_stmt); ++k)
+	    {
+	      tree op = gimple_call_arg (reduc_stmt, k);
+	      if (op == phi_result)
+		continue;
+	      bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+	      gcc_assert (is_simple_use);
+	      if (dt == vect_constant_def || dt == vect_external_def)
+		continue;
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+		vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
+	      break;
+	    }
+
+	}
+      else
+	gcc_unreachable ();
+
+      gimple *reduc_stmt = reduc_stmt_info->stmt;
+
       /* For a nested cycle we might end up with an operation like
          phi_result * phi_result.  */
       if (!vectype_in)
@@ -6103,7 +6242,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	single_defuse_cycle = true;
 
       /* Create the destination vector  */
-      scalar_dest = gimple_assign_lhs (reduc_stmt);
+      scalar_dest = gimple_get_lhs (reduc_stmt);
       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
       if (slp_node)
@@ -6177,39 +6316,51 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
         inside the loop body. The last operand is the reduction variable,
         which is defined by the loop-header-phi.  */
 
-  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-
   /* Flatten RHS.  */
-  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
-    {
-    case GIMPLE_BINARY_RHS:
-      code = gimple_assign_rhs_code (stmt);
-      op_type = TREE_CODE_LENGTH (code);
-      gcc_assert (op_type == binary_op);
-      ops[0] = gimple_assign_rhs1 (stmt);
-      ops[1] = gimple_assign_rhs2 (stmt);
-      break;
-
-    case GIMPLE_TERNARY_RHS:
-      code = gimple_assign_rhs_code (stmt);
-      op_type = TREE_CODE_LENGTH (code);
-      gcc_assert (op_type == ternary_op);
-      ops[0] = gimple_assign_rhs1 (stmt);
-      ops[1] = gimple_assign_rhs2 (stmt);
-      ops[2] = gimple_assign_rhs3 (stmt);
-      break;
-
-    case GIMPLE_UNARY_RHS:
-      return false;
+  if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
+    {
+      switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
+	{
+	case GIMPLE_BINARY_RHS:
+	  code = gimple_assign_rhs_code (stmt);
+	  op_type = TREE_CODE_LENGTH ((enum tree_code) code);
+	  gcc_assert (op_type == binary_op);
+	  ops[0] = gimple_assign_rhs1 (stmt);
+	  ops[1] = gimple_assign_rhs2 (stmt);
+	  break;
 
-    default:
-      gcc_unreachable ();
+	case GIMPLE_TERNARY_RHS:
+	  code = gimple_assign_rhs_code (stmt);
+	  op_type = TREE_CODE_LENGTH ((enum tree_code) code);
+	  gcc_assert (op_type == ternary_op);
+	  ops[0] = gimple_assign_rhs1 (stmt);
+	  ops[1] = gimple_assign_rhs2 (stmt);
+	  ops[2] = gimple_assign_rhs3 (stmt);
+	  break;
+
+	case GIMPLE_UNARY_RHS:
+	  return false;
+
+	default:
+	  gcc_unreachable ();
+	}
     }
+  else if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
+    {
+      code = code_helper_for_stmt (stmt);
+      gcc_assert (gimple_call_num_args (stmt) == 2);
+      op_type = binary_op;
+      ops[0] = gimple_call_arg (stmt, 0);
+      ops[1] = gimple_call_arg (stmt, 1);
+    }
+  else
+    gcc_unreachable ();
 
   if (code == COND_EXPR && slp_node)
     return false;
 
-  scalar_dest = gimple_assign_lhs (stmt);
+  gimple *stmt = stmt_info->stmt;
+  scalar_dest = gimple_get_lhs (stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
       && !SCALAR_FLOAT_TYPE_P (scalar_type))
@@ -6533,7 +6684,12 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
-      optab = optab_for_tree_code (code, vectype_in, optab_default);
+      if (code.is_tree_code ())
+	optab = optab_for_tree_code (code, vectype_in, optab_default);
+      else
+	/* Use MAX_EXPR tree_code for the call-based reductions.  */
+	optab = optab_for_tree_code (MAX_EXPR, vectype_in, optab_default);
+
       if (!optab)
         {
           if (dump_enabled_p ())
@@ -6897,7 +7053,15 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   else
     vec_num = 1;
 
-  internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_fn;
+  if (code.is_fn_code ())
+    {
+      gcall *orig_call = as_a <gcall *> (stmt_info->stmt);
+      internal_fn ifn = replacement_internal_fn (orig_call);
+      cond_fn = get_conditional_internal_fn (ifn);
+    }
+  else
+    cond_fn = get_conditional_internal_fn ((enum tree_code) code);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 
   if (!vec_stmt) /* transformation not required.  */
@@ -7074,7 +7238,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	      /* Make sure that the reduction accumulator is vop[0].  */
 	      if (reduc_index == 1)
 		{
-		  gcc_assert (commutative_tree_code (code));
+		  gcc_assert (code.is_fn_code ()
+			      || commutative_tree_code (code));
 		  std::swap (vop[0], vop[1]);
 		}
 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
@@ -7088,6 +7253,18 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	      new_stmt_info
 		= vect_finish_stmt_generation (stmt_info, call, gsi);
 	    }
+	  else if (code.is_fn_code ())
+	    {
+	      gcall *orig_call = as_a <gcall *> (stmt_info->stmt);
+	      internal_fn ifn = replacement_internal_fn (orig_call);
+	      gcall *call
+		= gimple_build_call_internal (ifn, 2, vop[0], vop[1]);
+	      new_temp = make_ssa_name (vec_dest, call);
+	      gimple_call_set_lhs (call, new_temp);
+	      gimple_call_set_nothrow (call, true);
+	      new_stmt_info
+		= vect_finish_stmt_generation (stmt_info, call, gsi);
+	    }
 	  else
 	    {
 	      if (op_type == ternary_op)
diff --git gcc/tree-vectorizer.h gcc/tree-vectorizer.h
index f1c186b..578105dc 100644
--- gcc/tree-vectorizer.h
+++ gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef struct _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "gimple-match.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -1556,7 +1557,7 @@ extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
 /* Used in gimple-loop-interchange.c.  */
 extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
-				  enum tree_code);
+				  code_helper);
 /* Drive for loop analysis stage.  */
 extern opt_loop_vec_info vect_analyze_loop (struct loop *,
 					    loop_vec_info,
