From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 1984)
	id 5A2693858CDB; Wed, 18 Oct 2023 08:54:45 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5A2693858CDB
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697619285;
	bh=qBRrzOHTh8z+i7DTH8DFcETUbASWZ8KZFg+Y/1dFYK0=;
	h=From:To:Subject:Date:From;
	b=U54SO8j6eqsqouVRackIc8QI4pa0Xo0ASKcG1Jrj2YzhJML2LVvTNARfD5NQbeCK9
	 E9ZybhNdkzt0HytiPJaB83yBqJqoq9qyK0KueFFMAMJg9ue9JZnt84w5l2Bm95PO5t
	 T1N00XsKZN2zm8HJSLYl8hlWj26zPydETG8C0V6A=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Tamar Christina
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4713] middle-end: Fold vec_cond into conditional ternary
 or binary operation when sharing operand [PR109154]
X-Act-Checkin: gcc
X-Git-Author: Tamar Christina
X-Git-Refname: refs/heads/master
X-Git-Oldrev: b588dcb77e96d77777eb5647cba9e8f454e314dc
X-Git-Newrev: 4b39aeef594f311e2c1715f15608f1d7ebc2d868
Message-Id: <20231018085445.5A2693858CDB@sourceware.org>
Date: Wed, 18 Oct 2023 08:54:45 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:4b39aeef594f311e2c1715f15608f1d7ebc2d868

commit r14-4713-g4b39aeef594f311e2c1715f15608f1d7ebc2d868
Author: Tamar Christina
Date:   Wed Oct 18 09:32:55 2023 +0100

    middle-end: Fold vec_cond into conditional ternary or binary operation
    when sharing operand [PR109154]

    When we have a vector conditional on a masked target that selects on
    the result of a conditional operation, and one of the operands of that
    conditional operation is the other operand of the select, we can fold
    the vector conditional into the operation.

    Concretely this transforms

      c = mask1 ? (masked_op mask2 a b) : b

    into

      c = masked_op (mask1 & mask2) a b

    The mask is then propagated upwards by the compiler.  In the SVE case
    we don't end up needing a mask AND here, since `mask2` ends up in the
    instruction creating the mask, which gives us a natural &.

    Such transformations are more common in GCC 13+, as PRE has now started
    unsharing common code when that can make one branch fully independent;
    e.g. in this case `b` becomes a loop-invariant value after PRE.

    This transformation removes the extra select for masked architectures
    but doesn't fix the general case.

    gcc/ChangeLog:

            PR tree-optimization/109154
            * match.pd: Add new cond_op rule.

    gcc/testsuite/ChangeLog:

            PR tree-optimization/109154
            * gcc.target/aarch64/sve/pre_cond_share_1.c: New test.
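[Illustrative note, not part of the commit: a reduced loop of roughly the
following shape is the kind of input that, once if-converted and vectorized
with masking, yields a VEC_COND selecting between a .COND_MUL result and one
of its operands.  The function and flags below are hypothetical, and whether
the vectorizer produces exactly this shape depends on the rest of the
pipeline.]

    /* Hypothetical reduced example.  With masked vectorization
       (e.g. -Ofast -march=armv8.2-a+sve) the select below becomes roughly

         c = mask1 ? .COND_MUL (mask2, a, b, b) : b

       which the new rule rewrites to

         c = .COND_MUL (mask1 & mask2, a, b, b).  */
    void
    f (int n, float *restrict c, const float *restrict a,
       const float *restrict b, const int *restrict p, const int *restrict q)
    {
      for (int i = 0; i < n; i++)
        c[i] = p[i] ? (q[i] ? a[i] * b[i] : b[i]) : b[i];
    }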
Diff:
---
 gcc/match.pd                                       |  24 ++++
 .../gcc.target/aarch64/sve/pre_cond_share_1.c      | 132 +++++++++++++++++++++
 2 files changed, 156 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 067328a2a16b..a56838fb388a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8970,6 +8970,30 @@ and,
 	&& fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
    (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
 
+/* Detect simplification for vector condition folding where
+
+   c = mask1 ? (masked_op mask2 a b) : b
+
+   into
+
+   c = masked_op (mask1 & mask2) a b
+
+   where the operation can be partially applied to one operand.  */
+
+(for cond_op (COND_BINARY)
+ (simplify
+  (vec_cond @0
+   (cond_op:s @1 @2 @3 @4) @3)
+  (cond_op (bit_and @1 @0) @2 @3 @4)))
+
+/* And same for ternary expressions.  */
+
+(for cond_op (COND_TERNARY)
+ (simplify
+  (vec_cond @0
+   (cond_op:s @1 @2 @3 @4 @5) @4)
+  (cond_op (bit_and @1 @0) @2 @3 @4 @5)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
    expressions like:

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
new file mode 100644
index 000000000000..b51d0f298ea1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
@@ -0,0 +1,132 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <float.h>
+
+typedef struct __attribute__((__packed__)) _Atom {
+    float x, y, z;
+    int32_t type;
+} Atom;
+
+typedef struct __attribute__((__packed__)) _FFParams {
+    int32_t hbtype;
+    float radius;
+    float hphb;
+    float elsc;
+} FFParams;
+
+#ifndef PPWI
+#define PPWI (64)
+#endif
+
+#ifndef ITERS
+#define ITERS 8
+#endif
+
+#define DIFF_TOLERANCE_PCT 0.025f
+
+#define POSES_SIZE 393216
+#define PROTEIN_SIZE 938
+#define LIGAND_SIZE 26
+#define FORCEFIELD_SIZE 34
+
+#define ZERO 0.0f
+#define QUARTER 0.25f
+#define HALF 0.5f
+#define ONE 1.0f
+#define TWO 2.0f
+#define FOUR 4.0f
+#define CNSTNT 45.0f
+
+// Energy evaluation parameters
+#define HBTYPE_F 70
+#define HBTYPE_E 69
+#define HARDNESS 38.0f
+#define NPNPDIST 5.5f
+#define NPPDIST 1.0f
+
+void
+fasten_main(size_t group, size_t ntypes, size_t nposes, size_t natlig, size_t natpro,         //
+            const Atom *protein, const Atom *ligand,                                          //
+            const float *transforms_0, const float *transforms_1, const float *transforms_2, //
+            const float *transforms_3, const float *transforms_4, const float *transforms_5, //
+            const FFParams *forcefield, float *energies                                      //
+) {
+
+    float etot[PPWI];
+    float lpos_x[PPWI];
+
+    for (int l = 0; l < PPWI; l++) {
+        etot[l] = 0.f;
+        lpos_x[l] = 0.f;
+    }
+
+    // Loop over ligand atoms
+    for (int il = 0; il < natlig; il++) {
+        // Load ligand atom data
+        const Atom l_atom = ligand[il];
+        const FFParams l_params = forcefield[l_atom.type];
+        const int lhphb_ltz = l_params.hphb < 0.f;
+        const int lhphb_gtz = l_params.hphb > 0.f;
+
+        // Transform ligand atom
+
+        // Loop over protein atoms
+        for (int ip = 0; ip < natpro; ip++) {
+            // Load protein atom data
+            const Atom p_atom = protein[ip];
+            const FFParams p_params = forcefield[p_atom.type];
+
+            const float radij = p_params.radius + l_params.radius;
+            const float r_radij = ONE / radij;
+
+            const float elcdst = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F) ? FOUR
+                                                                                              : TWO;
+            const float elcdst1 = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F)
+                                  ? QUARTER : HALF;
+            const int type_E = ((p_params.hbtype == HBTYPE_E || l_params.hbtype == HBTYPE_E));
+
+            const int phphb_ltz = p_params.hphb < 0.f;
+            const int phphb_gtz = p_params.hphb > 0.f;
+            const int phphb_nz = p_params.hphb != 0.f;
+            const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
+            const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
+            const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST)
+                                              : (lhphb_ltz ? NPPDIST : -FLT_MAX));
+            const float r_distdslv = ONE / distdslv;
+
+            const float chrg_init = l_params.elsc * p_params.elsc;
+            const float dslv_init = p_hphb + l_hphb;
+
+            for (int l = 0; l < PPWI; l++) {
+                // Calculate distance between atoms
+                const float x = lpos_x[l] - p_atom.x;
+                const float distij = (x * x);
+
+                // Calculate the sum of the sphere radii
+                const float distbb = distij - radij;
+
+                const int zone1 = (distbb < ZERO);
+
+                // Calculate formal and dipole charge interactions
+                float chrg_e = chrg_init * ((zone1 ? ONE : (ONE - distbb * elcdst1)) *
+                                            (distbb < elcdst ? ONE : ZERO));
+                float neg_chrg_e = -fabsf(chrg_e);
+                chrg_e = type_E ? neg_chrg_e : chrg_e;
+                etot[l] += chrg_e * CNSTNT;
+            }
+        }
+    }
+
+    // Write result
+    for (int l = 0; l < PPWI; l++) {
+        energies[group * PPWI + l] = etot[l] * HALF;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times {\.COND_MUL} 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times {\.VCOND} 1 "optimized" } } */
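[Illustrative note, not part of the commit: a minimal scalar sanity check of
the identity both rules rely on, in the shared-operand shape the commit
message describes (the conditional operation's else value equals the shared
operand).  The cond_mul/cond_fma helpers are hypothetical stand-ins for the
internal .COND_MUL/.COND_FMA functions, not a GCC API.]

    #include <assert.h>

    /* Stand-in for .COND_MUL: mask ? a * b : else_val.  */
    static float cond_mul (int mask, float a, float b, float else_val)
    {
      return mask ? a * b : else_val;
    }

    /* Stand-in for .COND_FMA: mask ? a * b + c : else_val (fusion is
       irrelevant to the mask identity being checked).  */
    static float cond_fma (int mask, float a, float b, float c, float else_val)
    {
      return mask ? a * b + c : else_val;
    }

    int
    main (void)
    {
      float a = 3.0f, b = 5.0f, c = 7.0f;

      /* Exhaustively check all mask combinations.  */
      for (int m1 = 0; m1 < 2; m1++)
        for (int m2 = 0; m2 < 2; m2++)
          {
            /* Binary rule:  mask1 ? cond_mul (mask2, a, b, b) : b
               ==  cond_mul (mask1 & mask2, a, b, b).  */
            assert ((m1 ? cond_mul (m2, a, b, b) : b)
                    == cond_mul (m1 & m2, a, b, b));

            /* Ternary rule:  mask1 ? cond_fma (mask2, a, b, c, c) : c
               ==  cond_fma (mask1 & mask2, a, b, c, c).  */
            assert ((m1 ? cond_fma (m2, a, b, c, c) : c)
                    == cond_fma (m1 & m2, a, b, c, c));
          }
      return 0;
    }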