From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <liuhongt@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2078)
	id A83A4385841F; Tue, 20 Sep 2022 06:54:26 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A83A4385841F
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1663656866;
	bh=4JKMvp4RTWrbKQ7YkdM5gugenMLILUVANE1Bv/YLL/c=;
	h=From:To:Subject:Date:From;
	b=mAIkVZH+oiN3JPoP1viwF+XpwNg5fJh6aPNJ3w+cyoGPTwr60+HPaXiiENmFy4j2e
	 lIAV3JQTSLRZXo6G0tPpkofX5me6+Nbw1HRbDiTxLKaEQNhOkhBLeFrMJIEFGAnMfw
	 LiPs+J2oU8ry9IPQuSqgYUrMAK5MnW8e6cTgz378=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: hongtao Liu <liuhongt@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r13-2730] Support 64-bit vectorization for single-precision
 floating rounding operation.
X-Act-Checkin: gcc
X-Git-Author: liuhongt <hongtao.liu@intel.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 3a035f1932eeb26f997cf28a5c752617dd09cb91
X-Git-Newrev: d0c73b6c85677e6755b60fa02d79a5c5e1a8eacd
Message-Id: <20220920065426.A83A4385841F@sourceware.org>
Date: Tue, 20 Sep 2022 06:54:26 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:d0c73b6c85677e6755b60fa02d79a5c5e1a8eacd

commit r13-2730-gd0c73b6c85677e6755b60fa02d79a5c5e1a8eacd
Author: liuhongt <hongtao.liu@intel.com>
Date:   Fri Sep 16 14:28:34 2022 +0800

    Support 64-bit vectorization for single-precision floating rounding operation.
    
    Here's list the patch supported.
    rint/nearbyint/ceil/floor/trunc/lrint/lceil/lfloor/round/lround.
    
    gcc/ChangeLog:
    
            PR target/106910
            * config/i386/mmx.md (nearbyintv2sf2): New expander.
            (rintv2sf2): Ditto.
            (ceilv2sf2): Ditto.
            (lceilv2sfv2si2): Ditto.
            (floorv2sf2): Ditto.
            (lfloorv2sfv2si2): Ditto.
            (btruncv2sf2): Ditto.
            (lrintv2sfv2si2): Ditto.
            (roundv2sf2): Ditto.
            (lroundv2sfv2si2): Ditto.
            (*mmx_roundv2sf2): New define_insn.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr106910-1.c: New test.

Diff:
---
 gcc/config/i386/mmx.md                     | 154 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106910-1.c |  77 +++++++++++++++
 2 files changed, 231 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index dda4b43f5c1..222a041de58 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1627,6 +1627,160 @@
   DONE;
 })
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point rounding operations.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "nearbyintv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "register_operand")
+	   (match_dup 2)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
+
+(define_expand "rintv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "register_operand")
+	   (match_dup 2)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "operands[2] = GEN_INT (ROUND_MXCSR);")
+
+(define_expand "ceilv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "register_operand")
+	   (match_dup 2)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && !flag_trapping_math
+   && TARGET_MMX_WITH_SSE"
+  "operands[2] = GEN_INT (ROUND_CEIL | ROUND_NO_EXC);")
+
+(define_expand "lceilv2sfv2si2"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+  && TARGET_MMX_WITH_SSE"
+{
+  rtx tmp = gen_reg_rtx (V2SFmode);
+  emit_insn (gen_ceilv2sf2 (tmp, operands[1]));
+  emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "floorv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "vector_operand")
+	   (match_dup 2)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && !flag_trapping_math
+  && TARGET_MMX_WITH_SSE"
+  "operands[2] = GEN_INT (ROUND_FLOOR | ROUND_NO_EXC);")
+
+(define_expand "lfloorv2sfv2si2"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+  && TARGET_MMX_WITH_SSE"
+{
+  rtx tmp = gen_reg_rtx (V2SFmode);
+  emit_insn (gen_floorv2sf2 (tmp, operands[1]));
+  emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "btruncv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "register_operand")
+	   (match_dup 2)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && !flag_trapping_math"
+  "operands[2] = GEN_INT (ROUND_TRUNC | ROUND_NO_EXC);")
+
+(define_insn "*mmx_roundv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,v")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "register_operand" "Yr,x,v")
+	   (match_operand:SI 2 "const_0_to_15_operand")]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "%vroundps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssecvt")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "lrintv2sfv2si2"
+  [(set (match_operand:V2SI 0 "register_operand" "=v")
+	(unspec:V2SI
+	  [(match_operand:V2SF 1 "register_operand" "v")]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_MMX_WITH_SSE"
+  "%vcvtps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "roundv2sf2"
+  [(set (match_dup 3)
+	(plus:V2SF
+	  (match_operand:V2SF 1 "register_operand")
+	  (match_dup 2)))
+   (set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_dup 3) (match_dup 4)]
+	  UNSPEC_ROUND))]
+  "TARGET_SSE4_1 && !flag_trapping_math
+   && TARGET_MMX_WITH_SSE"
+{
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx half, vec_half;
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (SFmode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, SFmode);
+  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+  half = const_double_from_real_value (pred_half, SFmode);
+
+  vec_half = ix86_build_const_vector (V2SFmode, true, half);
+  vec_half = force_reg (V2SFmode, vec_half);
+
+  operands[2] = gen_reg_rtx (V2SFmode);
+  emit_insn (gen_copysignv2sf3 (operands[2], vec_half, operands[1]));
+
+  operands[3] = gen_reg_rtx (V2SFmode);
+  operands[4] = GEN_INT (ROUND_TRUNC);
+})
+
+(define_expand "lroundv2sfv2si2"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+  && TARGET_MMX_WITH_SSE"
+{
+  rtx tmp = gen_reg_rtx (V2SFmode);
+  emit_insn (gen_roundv2sf2 (tmp, operands[1]));
+  emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+  DONE;
+})
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel half-precision floating point arithmetic
diff --git a/gcc/testsuite/gcc.target/i386/pr106910-1.c b/gcc/testsuite/gcc.target/i386/pr106910-1.c
new file mode 100644
index 00000000000..c7685a32183
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106910-1.c
@@ -0,0 +1,77 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse4.1 -O2 -Ofast" } */
+/* { dg-final { scan-assembler-times "roundps" 9 } } */
+/* { dg-final { scan-assembler-times "cvtps2dq" 1 } } */
+/* { dg-final { scan-assembler-times "cvttps2dq" 3 } } */
+
+#include<math.h>
+
+void
+foo (float* p, float* __restrict q)
+{
+  p[0] = truncf (q[0]);
+  p[1] = truncf (q[1]);
+}
+
+void
+foo1 (float* p, float* __restrict q)
+{
+  p[0] = floorf (q[0]);
+  p[1] = floorf (q[1]);
+}
+
+void
+foo1i (int* p, float* __restrict q)
+{
+  p[0] = (int) floorf (q[0]);
+  p[1] = (int) floorf (q[1]);
+}
+
+void
+foo2 (float* p, float* __restrict q)
+{
+  p[0] = ceilf (q[0]);
+  p[1] = ceilf (q[1]);
+}
+
+void
+foo2i (int* p, float* __restrict q)
+{
+  p[0] = (int) ceilf (q[0]);
+  p[1] = (int) ceilf (q[1]);
+}
+
+void
+foo3 (float* p, float* __restrict q)
+{
+  p[0] = rintf (q[0]);
+  p[1] = rintf (q[1]);
+}
+
+void
+foo3i (int* p, float* __restrict q)
+{
+  p[0] = (int) rintf (q[0]);
+  p[1] = (int) rintf (q[1]);
+}
+
+void
+foo4 (float* p, float* __restrict q)
+{
+  p[0] = nearbyintf (q[0]);
+  p[1] = nearbyintf (q[1]);
+}
+
+void
+foo5(float* p, float* __restrict q)
+{
+  p[0] = roundf (q[0]);
+  p[1] = roundf (q[1]);
+}
+
+void
+foo5i(int* p, float* __restrict q)
+{
+  p[0] = (int) roundf (q[0]);
+  p[1] = (int) roundf (q[1]);
+}