public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* RISC-V: Use convert instructions instead of calling library functions
@ 2024-03-18  9:09 Jivan Hakobyan
  2024-03-19  3:50 ` Jeff Law
  0 siblings, 1 reply; 7+ messages in thread
From: Jivan Hakobyan @ 2024-03-18  9:09 UTC (permalink / raw)
  To: GCC Patches, Jeff Law


[-- Attachment #1.1: Type: text/plain, Size: 2368 bytes --]

As RV has round instructions it is reasonable to use them instead of
calling the library functions.

With my patch for the following C code:
double foo(double a) {
    return ceil(a);
}

GCC generates the following ASM code (before it was tail call)
foo:
        fabs.d  fa4,fa0
        lui     a5,%hi(.LC0)
        fld     fa3,%lo(.LC0)(a5)
        flt.d   a5,fa4,fa3
        beq     a5,zero,.L3
        fcvt.l.d a5,fa0,rup
        fcvt.d.l        fa4,a5
        fsgnj.d fa0,fa4,fa0
.L3:
        ret

.LC0:
        .word   0
        .word   1127219200     // 0x4330000000000000


The patch I have evaluated on SPEC2017.
Counted dynamic instructions counts and got the following improvements

510.parest_r       262 m      -
511.povray_r      2.1  b        0.04%
521.wrt_r            269 m       -
526.blender_r    3 b             0.1%
527.cam4_r       15 b           0.6%
538.imagick_r    365 b         7.6%

Overall executed 385 billion fewer instructions which is 0.5%.


gcc/ChangeLog:
        * config/riscv/iterators.md (fix_ops, fix_uns): New iterators for
fix patterns.
        (RINT, rint_pattern, rint_rm): Removed.
        * config/riscv/riscv-protos.h (get_fp_rounding_coefficient): Add
function declaration.
        * config/riscv/riscv-v.cc (get_fp_rounding_coefficient): Turned to
not static
        * config/riscv/riscv.md (UNSPEC_LROUND): Removed.
        (<fix_uns>_trunc<ANYF:mode>si2, lrint<ANYF:mode>si2): New expanders.
        (l<round_pattern><ANYF:mode>si2,
<round_pattern><ANYF:mode>2):  Likewise.
        (<fix_uns>_trunc<ANYF:mode>si2_sext,
lrint<ANYF:mode>si2_sext): Expose generator.
        (l<round_pattern><ANYF:mode>si2_sext): Likewise.
        (<fix_uns>_trunc<ANYF:mode>si2, lrint<ANYF:mode>si2): Hide
generator.
        (l<round_pattern><ANYF:mode>si2): Hide generator.
        (fix_trunc<ANYF:mode><GPR:mode>2,
fixuns_trunc<ANYF:mode><GPR:mode>2): Removed.
        (l<rint_pattern><ANYF:mode><GPR:mode>2,
<round_pattern><ANYF:mode>2): Likewise.
        (<fix_uns>_trunc<ANYF:mode>di2, lrint<ANYF:mode>di2): New patterns.
        (l<round_pattern><ANYF:mode>di2): Likewise.

gcc/testsuite/ChangeLog:
        * gcc.target/riscv/fix.c: New test.
        * gcc.target/riscv/round.c: Likewise.
        * gcc.target/riscv/round_32.c: Likewise.
        * gcc.target/riscv/round_64.c: Likewise.


-- 
With the best regards
Jivan Hakobyan

[-- Attachment #2: round.diff --]
[-- Type: text/x-patch, Size: 19600 bytes --]

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index a7694137685aee97ca249c0e720afdfc62ec33c9..75e119e407a36c273eaa6e5ffab24be42af7a8d7 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -196,6 +196,13 @@
 
 (define_code_iterator bitmanip_rotate [rotate rotatert])
 
+;; These code iterators allow the signed and unsigned fix operations to use
+;; the same template.
+(define_code_iterator fix_ops [fix unsigned_fix])
+
+(define_code_attr fix_uns [(fix "fix") (unsigned_fix "fixuns")])
+
+
 ;; -------------------------------------------------------------------
 ;; Code Attributes
 ;; -------------------------------------------------------------------
@@ -312,11 +319,6 @@
 ;; Int Iterators.
 ;; -------------------------------------------------------------------
 
-;; Iterator and attributes for floating-point rounding instructions.
-(define_int_iterator RINT [UNSPEC_LRINT UNSPEC_LROUND])
-(define_int_attr rint_pattern [(UNSPEC_LRINT "rint") (UNSPEC_LROUND "round")])
-(define_int_attr rint_rm [(UNSPEC_LRINT "dyn") (UNSPEC_LROUND "rmm")])
-
 ;; Iterator and attributes for quiet comparisons.
 (define_int_iterator QUIET_COMPARISON [UNSPEC_FLT_QUIET UNSPEC_FLE_QUIET])
 (define_int_attr quiet_pattern [(UNSPEC_FLT_QUIET "lt") (UNSPEC_FLE_QUIET "le")])
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index b87355938052a3a0ca9107774bb3a683c85b74d9..03486f4c4e3dab733e48a702c4ffbb5865f1884d 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -709,6 +709,7 @@ bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
 bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
 bool splat_to_scalar_move_p (rtx *);
+rtx get_fp_rounding_coefficient (machine_mode);
 }
 
 /* We classify builtin types into two classes:
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 967f4e382875dfeee7d5de5ab05a2b537766844e..95a233dea44168dec2a28c9ffff4a46004a40e18 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4494,7 +4494,7 @@ vls_mode_valid_p (machine_mode vls_mode)
       All double floating point will be unchanged for ceil if it is
       greater than and equal to 4503599627370496.
  */
-static rtx
+rtx
 get_fp_rounding_coefficient (machine_mode inner_mode)
 {
   REAL_VALUE_TYPE real;
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index b16ed97909c04456ce4fe5234a82c5597549b67d..d4eb440d7baeb3d71c7e58291ce4136da6852246 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -64,7 +64,6 @@
   UNSPEC_ROUNDEVEN
   UNSPEC_NEARBYINT
   UNSPEC_LRINT
-  UNSPEC_LROUND
   UNSPEC_FMIN
   UNSPEC_FMAX
   UNSPEC_FMINM
@@ -1967,21 +1966,48 @@
 ;;
 ;;  ....................
 
-(define_insn "fix_trunc<ANYF:mode><GPR:mode>2"
-  [(set (match_operand:GPR      0 "register_operand" "=r")
-	(fix:GPR
+(define_expand "<fix_uns>_trunc<ANYF:mode>si2"
+  [(set (match_operand:SI      0 "register_operand" "=r")
+	(fix_ops:SI
 	    (match_operand:ANYF 1 "register_operand" " f")))]
   "TARGET_HARD_FLOAT || TARGET_ZFINX"
-  "fcvt.<GPR:ifmt>.<ANYF:fmt> %0,%1,rtz"
+{
+  if (TARGET_64BIT)
+    {
+      rtx t = gen_reg_rtx (DImode);
+      emit_insn (gen_<fix_uns>_trunc<ANYF:mode>si2_sext (t, operands[1]));
+      t = gen_lowpart (SImode, t);
+      SUBREG_PROMOTED_VAR_P (t) = 1;
+      SUBREG_PROMOTED_SET (t, SRP_SIGNED);
+      emit_move_insn (operands[0], t);
+      DONE;
+    }
+})
+
+(define_insn "*<fix_uns>_trunc<ANYF:mode>si2"
+  [(set (match_operand:SI      0 "register_operand" "=r")
+	(fix_ops:SI
+	    (match_operand:ANYF 1 "register_operand" " f")))]
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
+  "fcvt.w<u>.<ANYF:fmt> %0,%1,rtz"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_insn "<fix_uns>_trunc<ANYF:mode>si2_sext"
+  [(set (match_operand:DI      0 "register_operand" "=r")
+  (sign_extend:DI (fix_ops:SI
+	    (match_operand:ANYF 1 "register_operand" " f"))))]
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.w<u>.<ANYF:fmt> %0,%1,rtz"
   [(set_attr "type" "fcvt_f2i")
    (set_attr "mode" "<ANYF:MODE>")])
 
-(define_insn "fixuns_trunc<ANYF:mode><GPR:mode>2"
-  [(set (match_operand:GPR      0 "register_operand" "=r")
-	(unsigned_fix:GPR
+(define_insn "<fix_uns>_trunc<ANYF:mode>di2"
+  [(set (match_operand:DI      0 "register_operand" "=r")
+	(fix_ops:DI
 	    (match_operand:ANYF 1 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT  || TARGET_ZFINX"
-  "fcvt.<GPR:ifmt>u.<ANYF:fmt> %0,%1,rtz"
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.l<u>.<ANYF:fmt> %0,%1,rtz"
   [(set_attr "type" "fcvt_f2i")
    (set_attr "mode" "<ANYF:MODE>")])
 
@@ -2003,17 +2029,170 @@
   [(set_attr "type" "fcvt_i2f")
    (set_attr "mode" "<ANYF:MODE>")])
 
-(define_insn "l<rint_pattern><ANYF:mode><GPR:mode>2"
-  [(set (match_operand:GPR       0 "register_operand" "=r")
-	(unspec:GPR
+(define_expand "lrint<ANYF:mode>si2"
+  [(set (match_operand:SI       0 "register_operand" "=r")
+	(unspec:SI
 	    [(match_operand:ANYF 1 "register_operand" " f")]
-	    RINT))]
+	    UNSPEC_LRINT))]
   "TARGET_HARD_FLOAT || TARGET_ZFINX"
-  "fcvt.<GPR:ifmt>.<ANYF:fmt> %0,%1,<rint_rm>"
+{
+  if (TARGET_64BIT)
+    {
+      rtx t = gen_reg_rtx (DImode);
+      emit_insn (gen_lrint<ANYF:mode>si2_sext (t, operands[1]));
+      t = gen_lowpart (SImode, t);
+      SUBREG_PROMOTED_VAR_P (t) = 1;
+      SUBREG_PROMOTED_SET (t, SRP_SIGNED);
+      emit_move_insn (operands[0], t);
+      DONE;
+    }
+})
+
+(define_insn "*lrint<ANYF:mode>si2"
+  [(set (match_operand:SI       0 "register_operand" "=r")
+	(unspec:SI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+	    UNSPEC_LRINT))]
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
+  "fcvt.w.<ANYF:fmt> %0,%1,dyn"
   [(set_attr "type" "fcvt_f2i")
    (set_attr "mode" "<ANYF:MODE>")])
 
-(define_insn "<round_pattern><ANYF:mode>2"
+(define_insn "lrint<ANYF:mode>si2_sext"
+  [(set (match_operand:DI       0 "register_operand" "=r")
+  (sign_extend:DI (unspec:SI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+	    UNSPEC_LRINT)))]
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.w.<ANYF:fmt> %0,%1,dyn"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_insn "lrint<ANYF:mode>di2"
+  [(set (match_operand:DI       0 "register_operand" "=r")
+	(unspec:DI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+	    UNSPEC_LRINT))]
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.l.<ANYF:fmt> %0,%1,dyn"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_expand "l<round_pattern><ANYF:mode>si2"
+  [(set (match_operand:SI       0 "register_operand" "=r")
+	(unspec:SI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+    ROUND))]
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
+{
+  if (TARGET_64BIT)
+    {
+      rtx t = gen_reg_rtx (DImode);
+      emit_insn (gen_l<round_pattern><ANYF:mode>si2_sext (t, operands[1]));
+      t = gen_lowpart (SImode, t);
+      SUBREG_PROMOTED_VAR_P (t) = 1;
+      SUBREG_PROMOTED_SET (t, SRP_SIGNED);
+      emit_move_insn (operands[0], t);
+      DONE;
+    }
+})
+
+(define_insn "*l<round_pattern><ANYF:mode>si2"
+  [(set (match_operand:SI       0 "register_operand" "=r")
+	(unspec:SI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+    ROUND))]
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
+  "fcvt.w.<ANYF:fmt> %0,%1,<round_rm>"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_insn "l<round_pattern><ANYF:mode>si2_sext"
+  [(set (match_operand:DI       0 "register_operand" "=r")
+	 (sign_extend:DI (unspec:SI
+	                     [(match_operand:ANYF 1 "register_operand" " f")]
+                      ROUND)))]
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.w.<ANYF:fmt> %0,%1,<round_rm>"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_insn "l<round_pattern><ANYF:mode>di2"
+  [(set (match_operand:DI       0 "register_operand" "=r")
+	(unspec:DI
+	    [(match_operand:ANYF 1 "register_operand" " f")]
+    ROUND))]
+  "TARGET_64BIT && (TARGET_HARD_FLOAT || TARGET_ZFINX)"
+  "fcvt.l.<ANYF:fmt> %0,%1,<round_rm>"
+  [(set_attr "type" "fcvt_f2i")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+(define_expand "<round_pattern><ANYF:mode>2"
+  [(set (match_operand:ANYF     0 "register_operand" "=f")
+        (unspec:ANYF
+            [(match_operand:ANYF 1 "register_operand" " f")]
+        ROUND))]
+  "TARGET_HARD_FLOAT && (TARGET_ZFA
+                         || flag_fp_int_builtin_inexact || !flag_trapping_math)"
+{
+  if (TARGET_ZFA)
+    emit_insn (gen_<round_pattern><ANYF:mode>_zfa2 (operands[0],
+                                                    operands[1]));
+  else
+    {
+      rtx reg;
+      rtx label = gen_label_rtx ();
+      rtx end_label = gen_label_rtx ();
+      rtx abs_reg = gen_reg_rtx (<ANYF:MODE>mode);
+      rtx coeff_reg = gen_reg_rtx (<ANYF:MODE>mode);
+      rtx tmp_reg = gen_reg_rtx (<ANYF:MODE>mode);
+
+      riscv_emit_move (tmp_reg, operands[1]);
+      riscv_emit_move (coeff_reg,
+                       riscv_vector::get_fp_rounding_coefficient (<ANYF:MODE>mode));
+      emit_insn (gen_abs<ANYF:mode>2 (abs_reg, operands[1]));
+
+      riscv_expand_conditional_branch (label, LT, abs_reg, coeff_reg);
+
+      emit_jump_insn (gen_jump (end_label));
+      emit_barrier ();
+
+      emit_label (label);
+      switch (<ANYF:MODE>mode)
+        {
+        case SFmode:
+          reg = gen_reg_rtx (SImode);
+          emit_insn (gen_l<round_pattern>sfsi2 (reg, operands[1]));
+          emit_insn (gen_floatsisf2 (abs_reg, reg));
+          break;
+        case DFmode:
+          if (TARGET_64BIT)
+            {
+              reg = gen_reg_rtx (DImode);
+              emit_insn (gen_l<round_pattern>dfdi2 (reg, operands[1]));
+              emit_insn (gen_floatdidf2 (abs_reg, reg));
+            }
+          else
+            {
+              reg = gen_reg_rtx (SImode);
+              emit_insn (gen_l<round_pattern>dfsi2 (reg, operands[1]));
+              emit_insn (gen_floatsidf2 (abs_reg, reg));
+            }
+          break;
+        default:
+          gcc_unreachable ();
+        }
+
+      emit_insn (gen_copysign<ANYF:mode>3 (tmp_reg, abs_reg, operands[1]));
+
+      emit_label (end_label);
+      riscv_emit_move (operands[0], tmp_reg);
+    }
+
+  DONE;
+})
+
+(define_insn "<round_pattern><ANYF:mode>_zfa2"
   [(set (match_operand:ANYF     0 "register_operand" "=f")
 	(unspec:ANYF
 	    [(match_operand:ANYF 1 "register_operand" " f")]
diff --git a/gcc/testsuite/gcc.target/riscv/fix.c b/gcc/testsuite/gcc.target/riscv/fix.c
new file mode 100644
index 0000000000000000000000000000000000000000..265a7da1fc5d49cfbe8f3459407c7fa787b747dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fix.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc -mabi=lp64d" } */
+/* { dg-skip-if "" { *-*-* }  { "-O0" } } */
+
+int
+foo (double n)
+{
+  return n;
+}
+
+int
+foo_1 (float n)
+{
+  return n;
+}
+
+unsigned int
+foo_2 (double n)
+{
+  return n;
+}
+
+unsigned int
+foo_3 (float n)
+{
+  return n;
+}
+
+/* { dg-final { scan-assembler-times {\mfcvt.w.d} 1 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.w.s} 1 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.wu.d} 1 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.wu.s} 1 } } */
+/* { dg-final { scan-assembler-not "\\ssext.w\\s" } } */
+
diff --git a/gcc/testsuite/gcc.target/riscv/round.c b/gcc/testsuite/gcc.target/riscv/round.c
new file mode 100644
index 0000000000000000000000000000000000000000..decfc82a390e8f89bdc839cd00b0cb451030e4c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/round.c
@@ -0,0 +1,144 @@
+#include <math.h>
+
+extern void abort (void);
+extern void exit (int);
+
+#define NEQ(a, b) (fabs((a) - (b)) > 0.000001)
+
+#define DECL_FUNC(TYPE1, TYPE2, ROUND)              \
+   __attribute__ ((noinline, noclone)) TYPE2        \
+   convert_##TYPE1##_to_##TYPE2##_##ROUND (TYPE1 N) \
+    {                                               \
+      return ROUND (N);                             \
+    }
+
+#define DECL_ALL_ROUNDS_FOR(ROUND_FUNC) \
+  DECL_FUNC(float, float, ROUND_FUNC)   \
+  DECL_FUNC(double, double, ROUND_FUNC) \
+  DECL_FUNC(double, int, ROUND_FUNC)    \
+  DECL_FUNC(double, long, ROUND_FUNC)   \
+  DECL_FUNC(float, int, ROUND_FUNC)     \
+  DECL_FUNC(float, long, ROUND_FUNC)
+
+
+DECL_ALL_ROUNDS_FOR(round)
+DECL_ALL_ROUNDS_FOR(ceil)
+DECL_ALL_ROUNDS_FOR(floor)
+DECL_ALL_ROUNDS_FOR(trunc)
+DECL_ALL_ROUNDS_FOR(nearbyint)
+
+#define TEST_ROUND(TYPE1, TYPE2, N, N_R, ROUND)      \
+  if (NEQ (convert_##TYPE1##_to_##TYPE2##_##ROUND (N), N_R)) \
+    abort ();
+
+
+int main () {
+
+  /* Round */
+  TEST_ROUND(double, double, -4.8, -5.0, round);
+  TEST_ROUND(double, double, -4.2, -4.0, round);
+  TEST_ROUND(double, double, 4.8, 5.0, round);
+  TEST_ROUND(double, double, 4.2, 4.0, round);
+
+  TEST_ROUND(double, int, -4.8, -5, round);
+  TEST_ROUND(double, int, -4.2, -4, round);
+  TEST_ROUND(double, int, 4.8, 5, round);
+  TEST_ROUND(double, int, 4.2, 4, round);
+
+  TEST_ROUND(double, long, -4.8, -5, round);
+  TEST_ROUND(double, long, -4.2, -4, round);
+  TEST_ROUND(double, long, 4.8, 5, round);
+  TEST_ROUND(double, long, 4.2, 4, round);
+
+  TEST_ROUND(float, long, -4.8, -5, round);
+  TEST_ROUND(float, long, -4.2, -4, round);
+  TEST_ROUND(float, long, 4.8, 5, round);
+  TEST_ROUND(float, long, 4.2, 4, round);
+
+  /* Ceil */
+  TEST_ROUND(double, double, -4.8, -4.0, ceil);
+  TEST_ROUND(double, double, -4.2, -4.0, ceil);
+  TEST_ROUND(double, double, 4.8, 5.0, ceil);
+  TEST_ROUND(double, double, 4.2, 5.0, ceil);
+
+  TEST_ROUND(double, int, -4.8, -4, ceil);
+  TEST_ROUND(double, int, -4.2, -4, ceil);
+  TEST_ROUND(double, int, 4.8, 5, ceil);
+  TEST_ROUND(double, int, 4.2, 5, ceil);
+
+  TEST_ROUND(double, long, -4.8, -4, ceil);
+  TEST_ROUND(double, long, -4.2, -4, ceil);
+  TEST_ROUND(double, long, 4.8, 5, ceil);
+  TEST_ROUND(double, long, 4.2, 5, ceil);
+
+  TEST_ROUND(float, long, -4.8, -4, ceil);
+  TEST_ROUND(float, long, -4.2, -4, ceil);
+  TEST_ROUND(float, long, 4.8, 5, ceil);
+  TEST_ROUND(float, long, 4.2, 5, ceil);
+
+  /* Floor */
+  TEST_ROUND(double, double, -4.8, -5.0, floor);
+  TEST_ROUND(double, double, -4.2, -5.0, floor);
+  TEST_ROUND(double, double, 4.8, 4.0, floor);
+  TEST_ROUND(double, double, 4.2, 4.0, floor);
+
+  TEST_ROUND(double, int, -4.8, -5, floor);
+  TEST_ROUND(double, int, -4.2, -5, floor);
+  TEST_ROUND(double, int, 4.8, 4, floor);
+  TEST_ROUND(double, int, 4.2, 4, floor);
+
+  TEST_ROUND(double, long, -4.8, -5, floor);
+  TEST_ROUND(double, long, -4.2, -5, floor);
+  TEST_ROUND(double, long, 4.8, 4, floor);
+  TEST_ROUND(double, long, 4.2, 4, floor);
+
+  TEST_ROUND(float, long, -4.8, -5, floor);
+  TEST_ROUND(float, long, -4.2, -5, floor);
+  TEST_ROUND(float, long, 4.8, 4, floor);
+  TEST_ROUND(float, long, 4.2, 4, floor);
+
+  /* Trunc */
+  TEST_ROUND(double, double, -4.8, -4.0, trunc);
+  TEST_ROUND(double, double, -4.2, -4.0, trunc);
+  TEST_ROUND(double, double, 4.8, 4.0, trunc);
+  TEST_ROUND(double, double, 4.2, 4.0, trunc);
+
+  TEST_ROUND(double, int, -4.8, -4, trunc);
+  TEST_ROUND(double, int, -4.2, -4, trunc);
+  TEST_ROUND(double, int, 4.8, 4, trunc);
+  TEST_ROUND(double, int, 4.2, 4, trunc);
+
+  TEST_ROUND(double, long, -4.8, -4, trunc);
+  TEST_ROUND(double, long, -4.2, -4, trunc);
+  TEST_ROUND(double, long, 4.8, 4, trunc);
+  TEST_ROUND(double, long, 4.2, 4, trunc);
+
+  TEST_ROUND(float, long, -4.8, -4, trunc);
+  TEST_ROUND(float, long, -4.2, -4, trunc);
+  TEST_ROUND(float, long, 4.8, 4, trunc);
+  TEST_ROUND(float, long, 4.2, 4, trunc);
+
+  /* Nearbyint */
+  TEST_ROUND(double, double, -4.8, -5.0, nearbyint);
+  TEST_ROUND(double, double, -4.2, -4.0, nearbyint);
+  TEST_ROUND(double, double, 4.8, 5.0, nearbyint);
+  TEST_ROUND(double, double, 4.2, 4.0, nearbyint);
+
+  TEST_ROUND(double, int, -4.8, -5, nearbyint);
+  TEST_ROUND(double, int, -4.2, -4, nearbyint);
+  TEST_ROUND(double, int, 4.8, 5, nearbyint);
+  TEST_ROUND(double, int, 4.2, 4, nearbyint);
+
+  TEST_ROUND(double, long, -4.8, -5, nearbyint);
+  TEST_ROUND(double, long, -4.2, -4, nearbyint);
+  TEST_ROUND(double, long, 4.8, 5, nearbyint);
+  TEST_ROUND(double, long, 4.2, 4, nearbyint);
+
+  TEST_ROUND(float, long, -4.8, -5, nearbyint);
+  TEST_ROUND(float, long, -4.2, -4, nearbyint);
+  TEST_ROUND(float, long, 4.8, 5, nearbyint);
+  TEST_ROUND(float, long, 4.2, 4, nearbyint);
+
+  exit(0);
+}
+
diff --git a/gcc/testsuite/gcc.target/riscv/round_32.c b/gcc/testsuite/gcc.target/riscv/round_32.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9fea70ad5523c76c98bd2c20ce4d17db98b8b79
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/round_32.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { riscv32*-*-* } } } */
+/* { dg-options "-march=rv32gc -mabi=ilp32d -fno-math-errno -funsafe-math-optimizations -fno-inline" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+#include "round.c"
+
+/* { dg-final { scan-assembler-times {\mfcvt.w.s} 15 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.s.w} 5 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.d.w} 65 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.w.d} 15 } } */
+/* { dg-final { scan-assembler-times {,rup} 6 } } */
+/* { dg-final { scan-assembler-times {,rmm} 6 } } */
+/* { dg-final { scan-assembler-times {,rdn} 6 } } */
+/* { dg-final { scan-assembler-times {,rtz} 6 } } */
+/* { dg-final { scan-assembler-not {\mfcvt.l.d} } } */
+/* { dg-final { scan-assembler-not {\mfcvt.d.l} } } */
+/* { dg-final { scan-assembler-not "\\sceil\\s" } } */
+/* { dg-final { scan-assembler-not "\\sfloor\\s" } } */
+/* { dg-final { scan-assembler-not "\\sround\\s" } } */
+/* { dg-final { scan-assembler-not "\\snearbyint\\s" } } */
+/* { dg-final { scan-assembler-not "\\srint\\s" } } */
+/* { dg-final { scan-assembler-not "\\stail\\s" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/round_64.c b/gcc/testsuite/gcc.target/riscv/round_64.c
new file mode 100644
index 0000000000000000000000000000000000000000..e79690979a50663c1c3518724cc23466d9e20490
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/round_64.c
@@ -0,0 +1,23 @@
+/* { dg-do compile { target { riscv64*-*-* } } } */
+/* { dg-options "-march=rv64gc -mabi=lp64d -fno-math-errno -funsafe-math-optimizations -fno-inline" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+#include "round.c"
+
+/* { dg-final { scan-assembler-times {\mfcvt.w.s} 10 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.s.w} 5 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.l.d} 10 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.d.l} 45 } } */
+/* { dg-final { scan-assembler-times {\mfcvt.w.d} 5 } } */
+/* { dg-final { scan-assembler-times {,rup} 6 } } */
+/* { dg-final { scan-assembler-times {,rmm} 6 } } */
+/* { dg-final { scan-assembler-times {,rdn} 6 } } */
+/* { dg-final { scan-assembler-times {,rtz} 6 } } */
+/* { dg-final { scan-assembler-not "\\sceil\\s" } } */
+/* { dg-final { scan-assembler-not "\\sfloor\\s" } } */
+/* { dg-final { scan-assembler-not "\\sround\\s" } } */
+/* { dg-final { scan-assembler-not "\\snearbyint\\s" } } */
+/* { dg-final { scan-assembler-not "\\srint\\s" } } */
+/* { dg-final { scan-assembler-not "\\stail\\s" } } */
+/* { dg-final { scan-assembler-not "\\ssext.w\\s" } } */
+

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-18  9:09 RISC-V: Use convert instructions instead of calling library functions Jivan Hakobyan
@ 2024-03-19  3:50 ` Jeff Law
  2024-03-19 16:23   ` Palmer Dabbelt
  0 siblings, 1 reply; 7+ messages in thread
From: Jeff Law @ 2024-03-19  3:50 UTC (permalink / raw)
  To: Jivan Hakobyan, GCC Patches, Jeff Law



On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
> As RV has round instructions it is reasonable to use them instead of 
> calling the library functions.
> 
> With my patch for the following C code:
> double foo(double a) {
>      return ceil(a);
> }
> 
> GCC generates the following ASM code (before it was tail call)
> foo:
>          fabs.d  fa4,fa0
>          lui     a5,%hi(.LC0)
>          fld     fa3,%lo(.LC0)(a5)
>          flt.d   a5,fa4,fa3
>          beq     a5,zero,.L3
>          fcvt.l.d a5,fa0,rup
>          fcvt.d.l        fa4,a5
>          fsgnj.d fa0,fa4,fa0
> .L3:
>          ret
> 
> .LC0:
>          .word   0
>          .word   1127219200     // 0x4330000000000000
> 
> 
> The patch I have evaluated on SPEC2017.
> Counted dynamic instructions counts and got the following improvements
> 
> 510.parest_r       262 m      -
> 511.povray_r      2.1  b        0.04%
> 521.wrt_r            269 m       -
> 526.blender_r    3 b             0.1%
> 527.cam4_r       15 b           0.6%
> 538.imagick_r    365 b         7.6%
> 
> Overall executed 385 billion fewer instructions which is 0.5%.
A few more notes.

The sequence Jivan is using is derived from LLVM.  The condition in the 
generated code tests for those values were are supposed to pass through 
unaltered.  The condition in the pattern ensures we do something 
sensible WRT FE_INEXACT and mirrors how other ports handle these insns.

Our internal testing shows a benefit well beyond the 7% reduction in 
icounts.  Presumably due to fewer calls, fewer transfers across the 
register files, better scheduling around the call site, etc.

Obviously for Zfa we'll use the more efficient instructions for that 
extension.  But there's no reason to not go forward with this change for 
gcc-15.


Jeff

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-19  3:50 ` Jeff Law
@ 2024-03-19 16:23   ` Palmer Dabbelt
  2024-03-19 19:58     ` Andrew Waterman
  2024-03-20 18:54     ` Jeff Law
  0 siblings, 2 replies; 7+ messages in thread
From: Palmer Dabbelt @ 2024-03-19 16:23 UTC (permalink / raw)
  To: jeffreyalaw; +Cc: jivanhakobyan9, gcc-patches, Jeff Law

On Mon, 18 Mar 2024 20:50:14 PDT (-0700), jeffreyalaw@gmail.com wrote:
>
>
> On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
>> As RV has round instructions it is reasonable to use them instead of
>> calling the library functions.
>>
>> With my patch for the following C code:
>> double foo(double a) {
>>      return ceil(a);
>> }
>>
>> GCC generates the following ASM code (before it was tail call)
>> foo:
>>          fabs.d  fa4,fa0
>>          lui     a5,%hi(.LC0)
>>          fld     fa3,%lo(.LC0)(a5)
>>          flt.d   a5,fa4,fa3
>>          beq     a5,zero,.L3
>>          fcvt.l.d a5,fa0,rup

I'm not sure exactly what context this is in, but my reading of 
"according to the current rounding mode" means we'd usually use the 
dynamic rounding mode.

>>          fcvt.d.l        fa4,a5
>>          fsgnj.d fa0,fa4,fa0
>> .L3:
>>          ret
>>
>> .LC0:
>>          .word   0
>>          .word   1127219200     // 0x4330000000000000
>>
>>
>> The patch I have evaluated on SPEC2017.
>> Counted dynamic instructions counts and got the following improvements
>>
>> 510.parest_r       262 m      -
>> 511.povray_r      2.1  b        0.04%
>> 521.wrt_r            269 m       -
>> 526.blender_r    3 b             0.1%
>> 527.cam4_r       15 b           0.6%
>> 538.imagick_r    365 b         7.6%
>>
>> Overall executed 385 billion fewer instructions which is 0.5%.
> A few more notes.
>
> The sequence Jivan is using is derived from LLVM.  The condition in the
> generated code tests for those values were are supposed to pass through
> unaltered.  The condition in the pattern ensures we do something
> sensible WRT FE_INEXACT and mirrors how other ports handle these insns.

My only worry here is that when we were doing the other patterns we 
decided not to do rint.  I can't remember exactly why, but from reading 
the docs we probably just skipped it due to the inexact handling and Zfa 
having an instruction that just does this.  FP stuff is always a bit of 
a time sink, so at that point it probably just fell off the priority 
list.

I'm not really an FP guy, so I usually just poke around what the other 
ports generate and try to figure out what's going on.  arm64 has the Zfa 
instruction and x86 FP is complicated, so I'm not sure exactly who else 
to look at for this sort of stuff.  From just looking at the code, 
though, I think there's two issues -- I'm not really an FP person, 
though, so take this with a grain of salt:

IIUC what we've got here doesn't actually set the inexact flag for the 
bounds check failure case, as we're just loading up an exact constant 
and doing the bounds check.  We're also not clamping to INT_MAX-type 
values, but not sure if we're supposed to.  I think we could fix both of 
those by adjusting the expansion to something like

              fabs.d  fa4,fa0
              lui     a5,%hi(.LC0)
              fld     fa3,%lo(.LC0)(a5)
              flt.d   a5,fa4,fa3
              bne     a5,zero,.L3
              mv      fa0, fa3
     .L3:
              fcvt.l.d a5,fa0,rup
              fcvt.d.l        fa4,a5
              fsgnj.d fa0,fa4,fa0
              ret

and then adjusting the constant to be an epsilon larger than INT_MAX so 
it'll still trigger the clamping but also inexact.

There's also a pair of changes to the ISA in 2020 that added the 
conversion inexact handling requirement, it was a grey area before.  I 
don't remember exactly what happened there, but I did remember it 
happening.  I don't think anyone cares all that much about the 
performance of systems that target the older ISAs, so maybe we just 
restrict the non-libcall expansion to ISAs that contain the new wording?

    commit 5890a1a702abf4157d5879717a39d8ecdae0de68 (tag: draft-20201229-5890a1a)
    Author: Andrew Waterman <andrew@sifive.com>
    Date:   Mon Dec 28 18:19:44 2020 -0800
    
        Clarify when FP conversions raise the Inexact flag
    
    diff --git a/src/f.tex b/src/f.tex
    index 8649d75..97c253b 100644
    --- a/src/f.tex
    +++ b/src/f.tex
    @@ -594,9 +594,9 @@ instructions round according to the {\em rm} field.  A floating-point register
     can be initialized to floating-point positive zero using FCVT.S.W {\em rd},
     {\tt x0}, which will never set any exception flags.
    
    -All floating-point conversion instructions set the Inexact exception flag if the
    -result differs from its operand value, yet is representable in the destination
    -format.
    +All floating-point conversion instructions set the Inexact exception flag if
    +the rounded result differs from the operand value and the Invalid exception
    +flag is not set.
    
     \vspace{-0.2in}
     \begin{center}
    
    commit 27b40fbc798357fcb4b1deaba4553646fe677576 (tag: draft-20200229-27b40fb)
    Author: Andrew Waterman <andrew@sifive.com>
    Date:   Fri Feb 28 18:12:47 2020 -0800
    
        Clarify that FCVT instructions signal inexact
    
    diff --git a/src/f.tex b/src/f.tex
    index 7680347..a9022c4 100644
    --- a/src/f.tex
    +++ b/src/f.tex
    @@ -582,6 +582,10 @@ instructions round according to the {\em rm} field.  A floating-point register
     can be initialized to floating-point positive zero using FCVT.S.W {\em rd},
     {\tt x0}, which will never set any exception flags.
    
    +All floating-point conversion instructions raise the Inexact exception if the
    +result differs from its operand value, yet is representable in the destination
    +format.
    +
     \vspace{-0.2in}
     \begin{center}
     \begin{tabular}{R@{}F@{}R@{}R@{}F@{}R@{}O}

> Our internal testing shows a benefit well beyond the 7% reduction in
> icounts.  Presumably due to fewer calls, fewer transfers across the
> register files, better scheduling around the call site, etc.
>
> Obviously for Zfa we'll use the more efficient instructions for that
> extension.  But there's no reason to not go forward with this change for
> gcc-15.

Ya, I think those are all pretty minor issues (If the exist at all).   
So it seems like the right way to go in general.

>
> Jeff

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-19 16:23   ` Palmer Dabbelt
@ 2024-03-19 19:58     ` Andrew Waterman
  2024-03-19 20:19       ` Palmer Dabbelt
  2024-03-20 18:54     ` Jeff Law
  1 sibling, 1 reply; 7+ messages in thread
From: Andrew Waterman @ 2024-03-19 19:58 UTC (permalink / raw)
  To: Palmer Dabbelt; +Cc: jeffreyalaw, jivanhakobyan9, gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 7132 bytes --]

On Tue, Mar 19, 2024 at 9:23 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:

> On Mon, 18 Mar 2024 20:50:14 PDT (-0700), jeffreyalaw@gmail.com wrote:
> >
> >
> > On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
> >> As RV has round instructions it is reasonable to use them instead of
> >> calling the library functions.
> >>
> >> With my patch for the following C code:
> >> double foo(double a) {
> >>      return ceil(a);
> >> }
> >>
> >> GCC generates the following ASM code (before it was tail call)
> >> foo:
> >>          fabs.d  fa4,fa0
> >>          lui     a5,%hi(.LC0)
> >>          fld     fa3,%lo(.LC0)(a5)
> >>          flt.d   a5,fa4,fa3
> >>          beq     a5,zero,.L3
> >>          fcvt.l.d a5,fa0,rup
>
> I'm not sure exactly what context this is in, but my reading of
> "according to the current rounding mode" means we'd usually use the
> dynamic rounding mode.
>

ceil doesn't depend on the current rounding mode; rup is correct.  For
rint, you'd be correct.


>
> >>          fcvt.d.l        fa4,a5
> >>          fsgnj.d fa0,fa4,fa0
> >> .L3:
> >>          ret
> >>
> >> .LC0:
> >>          .word   0
> >>          .word   1127219200     // 0x4330000000000000
> >>
> >>
> >> The patch I have evaluated on SPEC2017.
> >> Counted dynamic instructions counts and got the following improvements
> >>
> >> 510.parest_r       262 m      -
> >> 511.povray_r      2.1  b        0.04%
> >> 521.wrt_r            269 m       -
> >> 526.blender_r    3 b             0.1%
> >> 527.cam4_r       15 b           0.6%
> >> 538.imagick_r    365 b         7.6%
> >>
> >> Overall executed 385 billion fewer instructions which is 0.5%.
> > A few more notes.
> >
> > The sequence Jivan is using is derived from LLVM.  The condition in the
> > generated code tests for those values were are supposed to pass through
> > unaltered.  The condition in the pattern ensures we do something
> > sensible WRT FE_INEXACT and mirrors how other ports handle these insns.
>
> My only worry here is that when we were doing the other patterns we
> decided not to do rint.  I can't remember exactly why, but from reading
> the docs we probably just skipped it due to the inexact handling and Zfa
> having an instruction that just does this.  FP stuff is always a bit of
> a time sink, so at that point it probably just fell off the priority
> list.
>
> I'm not really an FP guy, so I usually just poke around what the other
> ports generate and try to figure out what's going on.  arm64 has the Zfa
> instruction and x86 FP is complicated, so I'm not sure exactly who else
> to look at for this sort of stuff.  From just looking at the code,
> though, I think there's two issues -- I'm not really an FP person,
> though, so take this with a grain of salt:
>
> IIUC what we've got here doesn't actually set the inexact flag for the
> bounds check failure case


I think the original code was correct.  If you exceed the bounds, then by
construction you know that the input was already an integer, and so not
setting NX is correct.  If you didn't exceed the bounds, then fcvt.l.d will
set NX when appropriate.

, as we're just loading up an exact constant
> and doing the bounds check.  We're also not clamping to INT_MAX-type
> values, but not sure if we're supposed to.  I think we could fix both of
> those by adjusting the expansion to something like
>
>               fabs.d  fa4,fa0
>               lui     a5,%hi(.LC0)
>               fld     fa3,%lo(.LC0)(a5)
>               flt.d   a5,fa4,fa3
>               bne     a5,zero,.L3
>               mv      fa0, fa3
>      .L3:
>               fcvt.l.d a5,fa0,rup
>               fcvt.d.l        fa4,a5
>               fsgnj.d fa0,fa4,fa0
>               ret
>
> and then adjusting the constant to be an epsilon larger than INT_MAX so
> it'll still trigger the clamping but also inexact.
>

ceil/rint/etc. are supposed to work for the whole range of their
floating-point type; the range of the integral types isn't supposed to
affect the result.  The original code was correct in this regard, too.


>
> There's also a pair of changes to the ISA in 2020 that added the
> conversion inexact handling requirement, it was a grey area before.  I
> don't remember exactly what happened there, but I did remember it
> happening.  I don't think anyone cares all that much about the
> performance of systems that target the older ISAs, so maybe we just
> restrict the non-libcall expansion to ISAs that contain the new wording?
>
>     commit 5890a1a702abf4157d5879717a39d8ecdae0de68 (tag:
> draft-20201229-5890a1a)
>     Author: Andrew Waterman <andrew@sifive.com>
>     Date:   Mon Dec 28 18:19:44 2020 -0800
>
>         Clarify when FP conversions raise the Inexact flag
>
>     diff --git a/src/f.tex b/src/f.tex
>     index 8649d75..97c253b 100644
>     --- a/src/f.tex
>     +++ b/src/f.tex
>     @@ -594,9 +594,9 @@ instructions round according to the {\em rm}
> field.  A floating-point register
>      can be initialized to floating-point positive zero using FCVT.S.W
> {\em rd},
>      {\tt x0}, which will never set any exception flags.
>
>     -All floating-point conversion instructions set the Inexact exception
> flag if the
>     -result differs from its operand value, yet is representable in the
> destination
>     -format.
>     +All floating-point conversion instructions set the Inexact exception
> flag if
>     +the rounded result differs from the operand value and the Invalid
> exception
>     +flag is not set.
>
>      \vspace{-0.2in}
>      \begin{center}
>
>     commit 27b40fbc798357fcb4b1deaba4553646fe677576 (tag:
> draft-20200229-27b40fb)
>     Author: Andrew Waterman <andrew@sifive.com>
>     Date:   Fri Feb 28 18:12:47 2020 -0800
>
>         Clarify that FCVT instructions signal inexact
>
>     diff --git a/src/f.tex b/src/f.tex
>     index 7680347..a9022c4 100644
>     --- a/src/f.tex
>     +++ b/src/f.tex
>     @@ -582,6 +582,10 @@ instructions round according to the {\em rm}
> field.  A floating-point register
>      can be initialized to floating-point positive zero using FCVT.S.W
> {\em rd},
>      {\tt x0}, which will never set any exception flags.
>
>     +All floating-point conversion instructions raise the Inexact
> exception if the
>     +result differs from its operand value, yet is representable in the
> destination
>     +format.
>     +
>      \vspace{-0.2in}
>      \begin{center}
>      \begin{tabular}{R@{}F@{}R@{}R@{}F@{}R@{}O}
>
> > Our internal testing shows a benefit well beyond the 7% reduction in
> > icounts.  Presumably due to fewer calls, fewer transfers across the
> > register files, better scheduling around the call site, etc.
> >
> > Obviously for Zfa we'll use the more efficient instructions for that
> > extension.  But there's no reason to not go forward with this change for
> > gcc-15.
>
> Ya, I think those are all pretty minor issues (If the exist at all).
> So it seems like the right way to go in general.
>
> >
> > Jeff
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-19 19:58     ` Andrew Waterman
@ 2024-03-19 20:19       ` Palmer Dabbelt
  0 siblings, 0 replies; 7+ messages in thread
From: Palmer Dabbelt @ 2024-03-19 20:19 UTC (permalink / raw)
  To: Andrew Waterman; +Cc: jeffreyalaw, jivanhakobyan9, gcc-patches, Jeff Law

On Tue, 19 Mar 2024 12:58:41 PDT (-0700), Andrew Waterman wrote:
> On Tue, Mar 19, 2024 at 9:23 AM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
>> On Mon, 18 Mar 2024 20:50:14 PDT (-0700), jeffreyalaw@gmail.com wrote:
>> >
>> >
>> > On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
>> >> As RV has round instructions it is reasonable to use them instead of
>> >> calling the library functions.
>> >>
>> >> With my patch for the following C code:
>> >> double foo(double a) {
>> >>      return ceil(a);
>> >> }
>> >>
>> >> GCC generates the following ASM code (before it was tail call)
>> >> foo:
>> >>          fabs.d  fa4,fa0
>> >>          lui     a5,%hi(.LC0)
>> >>          fld     fa3,%lo(.LC0)(a5)
>> >>          flt.d   a5,fa4,fa3
>> >>          beq     a5,zero,.L3
>> >>          fcvt.l.d a5,fa0,rup
>>
>> I'm not sure exactly what context this is in, but my reading of
>> "according to the current rounding mode" means we'd usually use the
>> dynamic rounding mode.
>>
>
> ceil doesn't depend on the current rounding mode; rup is correct.  For
> rint, you'd be correct.

Ya, right, that's pretty obvious -- I must have just been falling asleep 
this morning.

>> >>          fcvt.d.l        fa4,a5
>> >>          fsgnj.d fa0,fa4,fa0
>> >> .L3:
>> >>          ret
>> >>
>> >> .LC0:
>> >>          .word   0
>> >>          .word   1127219200     // 0x4330000000000000
>> >>
>> >>
>> >> The patch I have evaluated on SPEC2017.
>> >> Counted dynamic instructions counts and got the following improvements
>> >>
>> >> 510.parest_r       262 m      -
>> >> 511.povray_r      2.1  b        0.04%
>> >> 521.wrt_r            269 m       -
>> >> 526.blender_r    3 b             0.1%
>> >> 527.cam4_r       15 b           0.6%
>> >> 538.imagick_r    365 b         7.6%
>> >>
>> >> Overall executed 385 billion fewer instructions which is 0.5%.
>> > A few more notes.
>> >
>> > The sequence Jivan is using is derived from LLVM.  The condition in the
>> > generated code tests for those values were are supposed to pass through
>> > unaltered.  The condition in the pattern ensures we do something
>> > sensible WRT FE_INEXACT and mirrors how other ports handle these insns.
>>
>> My only worry here is that when we were doing the other patterns we
>> decided not to do rint.  I can't remember exactly why, but from reading
>> the docs we probably just skipped it due to the inexact handling and Zfa
>> having an instruction that just does this.  FP stuff is always a bit of
>> a time sink, so at that point it probably just fell off the priority
>> list.
>>
>> I'm not really an FP guy, so I usually just poke around what the other
>> ports generate and try to figure out what's going on.  arm64 has the Zfa
>> instruction and x86 FP is complicated, so I'm not sure exactly who else
>> to look at for this sort of stuff.  From just looking at the code,
>> though, I think there's two issues -- I'm not really an FP person,
>> though, so take this with a grain of salt:
>>
>> IIUC what we've got here doesn't actually set the inexact flag for the
>> bounds check failure case
>
>
> I think the original code was correct.  If you exceed the bounds, then by
> construction you know that the input was already an integer, and so not
> setting NX is correct.  If you didn't exceed the bounds, then fcvt.l.d will
> set NX when appropriate.
>
> , as we're just loading up an exact constant
>> and doing the bounds check.  We're also not clamping to INT_MAX-type
>> values, but not sure if we're supposed to.  I think we could fix both of
>> those by adjusting the expansion to something like
>>
>>               fabs.d  fa4,fa0
>>               lui     a5,%hi(.LC0)
>>               fld     fa3,%lo(.LC0)(a5)
>>               flt.d   a5,fa4,fa3
>>               bne     a5,zero,.L3
>>               mv      fa0, fa3
>>      .L3:
>>               fcvt.l.d a5,fa0,rup
>>               fcvt.d.l        fa4,a5
>>               fsgnj.d fa0,fa4,fa0
>>               ret
>>
>> and then adjusting the constant to be an epsilon larger than INT_MAX so
>> it'll still trigger the clamping but also inexact.
>>
>
> ceil/rint/etc. are supposed to work for the whole range of their
> floating-point type; the range of the integral types isn't supposed to
> affect the result.  The original code was correct in this regard, too.

Ya, sorry, I think I just misunderstood what was going on here -- it's 
not INT_MAX, it's just the largest FP values for which mantissas might 
produce non-integral values.

>> There's also a pair of changes to the ISA in 2020 that added the
>> conversion inexact handling requirement, it was a grey area before.  I
>> don't remember exactly what happened there, but I did remember it
>> happening.  I don't think anyone cares all that much about the
>> performance of systems that target the older ISAs, so maybe we just
>> restrict the non-libcall expansion to ISAs that contain the new wording?
>>
>>     commit 5890a1a702abf4157d5879717a39d8ecdae0de68 (tag:
>> draft-20201229-5890a1a)
>>     Author: Andrew Waterman <andrew@sifive.com>
>>     Date:   Mon Dec 28 18:19:44 2020 -0800
>>
>>         Clarify when FP conversions raise the Inexact flag
>>
>>     diff --git a/src/f.tex b/src/f.tex
>>     index 8649d75..97c253b 100644
>>     --- a/src/f.tex
>>     +++ b/src/f.tex
>>     @@ -594,9 +594,9 @@ instructions round according to the {\em rm}
>> field.  A floating-point register
>>      can be initialized to floating-point positive zero using FCVT.S.W
>> {\em rd},
>>      {\tt x0}, which will never set any exception flags.
>>
>>     -All floating-point conversion instructions set the Inexact exception
>> flag if the
>>     -result differs from its operand value, yet is representable in the
>> destination
>>     -format.
>>     +All floating-point conversion instructions set the Inexact exception
>> flag if
>>     +the rounded result differs from the operand value and the Invalid
>> exception
>>     +flag is not set.
>>
>>      \vspace{-0.2in}
>>      \begin{center}
>>
>>     commit 27b40fbc798357fcb4b1deaba4553646fe677576 (tag:
>> draft-20200229-27b40fb)
>>     Author: Andrew Waterman <andrew@sifive.com>
>>     Date:   Fri Feb 28 18:12:47 2020 -0800
>>
>>         Clarify that FCVT instructions signal inexact
>>
>>     diff --git a/src/f.tex b/src/f.tex
>>     index 7680347..a9022c4 100644
>>     --- a/src/f.tex
>>     +++ b/src/f.tex
>>     @@ -582,6 +582,10 @@ instructions round according to the {\em rm}
>> field.  A floating-point register
>>      can be initialized to floating-point positive zero using FCVT.S.W
>> {\em rd},
>>      {\tt x0}, which will never set any exception flags.
>>
>>     +All floating-point conversion instructions raise the Inexact
>> exception if the
>>     +result differs from its operand value, yet is representable in the
>> destination
>>     +format.
>>     +
>>      \vspace{-0.2in}
>>      \begin{center}
>>      \begin{tabular}{R@{}F@{}R@{}R@{}F@{}R@{}O}
>>
>> > Our internal testing shows a benefit well beyond the 7% reduction in
>> > icounts.  Presumably due to fewer calls, fewer transfers across the
>> > register files, better scheduling around the call site, etc.
>> >
>> > Obviously for Zfa we'll use the more efficient instructions for that
>> > extension.  But there's no reason to not go forward with this change for
>> > gcc-15.
>>
>> Ya, I think those are all pretty minor issues (If the exist at all).
>> So it seems like the right way to go in general.
>>
>> >
>> > Jeff
>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-19 16:23   ` Palmer Dabbelt
  2024-03-19 19:58     ` Andrew Waterman
@ 2024-03-20 18:54     ` Jeff Law
  2024-03-20 19:34       ` Palmer Dabbelt
  1 sibling, 1 reply; 7+ messages in thread
From: Jeff Law @ 2024-03-20 18:54 UTC (permalink / raw)
  To: Palmer Dabbelt, jeffreyalaw; +Cc: jivanhakobyan9, gcc-patches



On 3/19/24 10:23 AM, Palmer Dabbelt wrote:
> On Mon, 18 Mar 2024 20:50:14 PDT (-0700), jeffreyalaw@gmail.com wrote:
>>
>>
>> On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
>>> As RV has round instructions it is reasonable to use them instead of
>>> calling the library functions.
>>>
>>> With my patch for the following C code:
>>> double foo(double a) {
>>>      return ceil(a);
>>> }
>>>
>>> GCC generates the following ASM code (before it was tail call)
>>> foo:
>>>          fabs.d  fa4,fa0
>>>          lui     a5,%hi(.LC0)
>>>          fld     fa3,%lo(.LC0)(a5)
>>>          flt.d   a5,fa4,fa3
>>>          beq     a5,zero,.L3
>>>          fcvt.l.d a5,fa0,rup
> 
> I'm not sure exactly what context this is in, but my reading of 
> "according to the current rounding mode" means we'd usually use the 
> dynamic rounding mode.
As Andrew W. noted, we're dealing with ceil and thus rup is the correct 
rounding mode to use here.



> 
> My only worry here is that when we were doing the other patterns we 
> decided not to do rint.  I can't remember exactly why, but from reading 
> the docs we probably just skipped it due to the inexact handling and Zfa 
> having an instruction that just does this.  FP stuff is always a bit of 
> a time sink, so at that point it probably just fell off the priority list.
rint is supposed to raise FE_INEXACT, so it's actually a good match for 
RISC-V fcvt semantics as they appropriately raise FE_INEXACT.

nearby* do not arise FE_INEXACT and thus would rely on the new Zfa 
instructions where we have ones that do not raise FE_INEXACT or they 
need to be conditional on flag_fp_int_builtin_inexact.  One could 
reasonably argue that when flag_fp_int_builtin_inexact is enabled that a 
call to nearby* ought to be converted into a call to rint*.


> 
> I'm not really an FP guy, so I usually just poke around what the other 
> ports generate and try to figure out what's going on.  arm64 has the Zfa 
> instruction and x86 FP is complicated, so I'm not sure exactly who else 
> to look at for this sort of stuff.  From just looking at the code, 
> though, I think there's two issues -- I'm not really an FP person, 
> though, so take this with a grain of salt:
Right.  And the condition under which we use the new sequence for 
ceil/round actually borrows from x86.  Essentially we only use the new 
sequence when we've been told we don't care about FE_INEXACT or fp 
exceptions in general.

> 
> IIUC what we've got here doesn't actually set the inexact flag for the 
> bounds check failure case, as we're just loading up an exact constant 
> and doing the bounds check.  We're also not clamping to INT_MAX-type 
> values, but not sure if we're supposed to.  I think we could fix both of 
> those by adjusting the expansion to something like
The state of FE_INEXACT is a don't care here due the condition on the 
expansion code.


> 
>               fabs.d  fa4,fa0
>               lui     a5,%hi(.LC0)
>               fld     fa3,%lo(.LC0)(a5)
>               flt.d   a5,fa4,fa3
>               bne     a5,zero,.L3
>               mv      fa0, fa3
>      .L3:
>               fcvt.l.d a5,fa0,rup
>               fcvt.d.l        fa4,a5
>               fsgnj.d fa0,fa4,fa0
>               ret
> 
> and then adjusting the constant to be an epsilon larger than INT_MAX so 
> it'll still trigger the clamping but also inexact.
I think Jivan's sequence is more correct.  It's not just INT_MAX here 
that's concerning, there's a whole class of values that cause problems.

> 
> There's also a pair of changes to the ISA in 2020 that added the 
> conversion inexact handling requirement, it was a grey area before.  I 
> don't remember exactly what happened there, but I did remember it 
> happening.  I don't think anyone cares all that much about the 
> performance of systems that target the older ISAs, so maybe we just 
> restrict the non-libcall expansion to ISAs that contain the new wording?
I think all this got sufficiently cleaned up.  The spec is explicit 
about when FE_INEXACT gets raised on the fcvt instructions.  I referred 
to it repeatedly when analyzing Jivan's work.

We can hash through the final items in a few weeks once the trunk 
re-opens for development.

Jeff

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: RISC-V: Use convert instructions instead of calling library functions
  2024-03-20 18:54     ` Jeff Law
@ 2024-03-20 19:34       ` Palmer Dabbelt
  0 siblings, 0 replies; 7+ messages in thread
From: Palmer Dabbelt @ 2024-03-20 19:34 UTC (permalink / raw)
  To: Jeff Law; +Cc: jeffreyalaw, jivanhakobyan9, gcc-patches

On Wed, 20 Mar 2024 11:54:34 PDT (-0700), Jeff Law wrote:
>
>
> On 3/19/24 10:23 AM, Palmer Dabbelt wrote:
>> On Mon, 18 Mar 2024 20:50:14 PDT (-0700), jeffreyalaw@gmail.com wrote:
>>>
>>>
>>> On 3/18/24 3:09 AM, Jivan Hakobyan wrote:
>>>> As RV has round instructions it is reasonable to use them instead of
>>>> calling the library functions.
>>>>
>>>> With my patch for the following C code:
>>>> double foo(double a) {
>>>>      return ceil(a);
>>>> }
>>>>
>>>> GCC generates the following ASM code (before it was tail call)
>>>> foo:
>>>>          fabs.d  fa4,fa0
>>>>          lui     a5,%hi(.LC0)
>>>>          fld     fa3,%lo(.LC0)(a5)
>>>>          flt.d   a5,fa4,fa3
>>>>          beq     a5,zero,.L3
>>>>          fcvt.l.d a5,fa0,rup
>>
>> I'm not sure exactly what context this is in, but my reading of
>> "according to the current rounding mode" means we'd usually use the
>> dynamic rounding mode.
> As Andrew W. noted, we're dealing with ceil and thus rup is the correct
> rounding mode to use here.
>
>
>
>>
>> My only worry here is that when we were doing the other patterns we
>> decided not to do rint.  I can't remember exactly why, but from reading
>> the docs we probably just skipped it due to the inexact handling and Zfa
>> having an instruction that just does this.  FP stuff is always a bit of
>> a time sink, so at that point it probably just fell off the priority list.
> rint is supposed to raise FE_INEXACT, so it's actually a good match for
> RISC-V fcvt semantics as they appropriately raise FE_INEXACT.
>
> nearby* do not arise FE_INEXACT and thus would rely on the new Zfa
> instructions where we have ones that do not raise FE_INEXACT or they
> need to be conditional on flag_fp_int_builtin_inexact.  One could
> reasonably argue that when flag_fp_int_builtin_inexact is enabled that a
> call to nearby* ought to be converted into a call to rint*.
>
>
>>
>> I'm not really an FP guy, so I usually just poke around what the other
>> ports generate and try to figure out what's going on.  arm64 has the Zfa
>> instruction and x86 FP is complicated, so I'm not sure exactly who else
>> to look at for this sort of stuff.  From just looking at the code,
>> though, I think there's two issues -- I'm not really an FP person,
>> though, so take this with a grain of salt:
> Right.  And the condition under which we use the new sequence for
> ceil/round actually borrows from x86.  Essentially we only use the new
> sequence when we've been told we don't care about FE_INEXACT or fp
> exceptions in general.
>
>>
>> IIUC what we've got here doesn't actually set the inexact flag for the
>> bounds check failure case, as we're just loading up an exact constant
>> and doing the bounds check.  We're also not clamping to INT_MAX-type
>> values, but not sure if we're supposed to.  I think we could fix both of
>> those by adjusting the expansion to something like
> The state of FE_INEXACT is a don't care here due the condition on the
> expansion code.
>
>
>>
>>               fabs.d  fa4,fa0
>>               lui     a5,%hi(.LC0)
>>               fld     fa3,%lo(.LC0)(a5)
>>               flt.d   a5,fa4,fa3
>>               bne     a5,zero,.L3
>>               mv      fa0, fa3
>>      .L3:
>>               fcvt.l.d a5,fa0,rup
>>               fcvt.d.l        fa4,a5
>>               fsgnj.d fa0,fa4,fa0
>>               ret
>>
>> and then adjusting the constant to be an epsilon larger than INT_MAX so
>> it'll still trigger the clamping but also inexact.
> I think Jivan's sequence is more correct.  It's not just INT_MAX here
> that's concerning, there's a whole class of values that cause problems.

Ya, sorry, I thought I'd replied to Andrew's email somewhere -- I'd just 
managed to confuse myself about how the FP stuff works, I also think 
Jivan's code is correct now.

>> There's also a pair of changes to the ISA in 2020 that added the
>> conversion inexact handling requirement, it was a grey area before.  I
>> don't remember exactly what happened there, but I did remember it
>> happening.  I don't think anyone cares all that much about the
>> performance of systems that target the older ISAs, so maybe we just
>> restrict the non-libcall expansion to ISAs that contain the new wording?
> I think all this got sufficiently cleaned up.  The spec is explicit
> about when FE_INEXACT gets raised on the fcvt instructions.  I referred
> to it repeatedly when analyzing Jivan's work.

We still have support for stuf like -misa-spec=2.2 (and some 2019 
releases with clunky version numbers).  Those all predate the 
convert/inexact wording that got added.

Though if FE_INEXACT is a don't care here, then I think it doesn't 
matter if the wording got changed.  In that case I think this is fine, 
so 

Reviewed-by: Palmer Dabbelt <palmer@rivosinc.com>

> We can hash through the final items in a few weeks once the trunk
> re-opens for development.

I think those were all the isuses on my end ;)

>
>
> Jeff

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-03-20 19:34 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-18  9:09 RISC-V: Use convert instructions instead of calling library functions Jivan Hakobyan
2024-03-19  3:50 ` Jeff Law
2024-03-19 16:23   ` Palmer Dabbelt
2024-03-19 19:58     ` Andrew Waterman
2024-03-19 20:19       ` Palmer Dabbelt
2024-03-20 18:54     ` Jeff Law
2024-03-20 19:34       ` Palmer Dabbelt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).