From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <meissner@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1005)
	id 870C33858D37; Sat, 29 Apr 2023 02:40:57 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 870C33858D37
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1682736057;
	bh=hzB6J567kFZ82ASi8lRtuOuEOhTHPbgMttIWcRcHj80=;
	h=From:To:Subject:Date:From;
	b=qV9qdTM5FnhzBAqS2T31+Qg3+R1R/5IqK/RqYeknkuco5pvfSggIlCTrxZVhDtLqY
	 gzQW1oCIjmItejzoFcQL83QZ42GSQL+qjUfnDZhbCRuINa/ObO/XX4/RbBXlFMQNFm
	 KVmo8wCm47ZjKqlC+u5pvlmov52qnNWRl2rjg7kw=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Michael Meissner <meissner@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF
 from memory with constant element numbers.
X-Act-Checkin: gcc
X-Git-Author: Michael Meissner <meissner@linux.ibm.com>
X-Git-Refname: refs/users/meissner/heads/work119
X-Git-Oldrev: 20d3a09037cd05d36fc9d73eae93fb75dead73bc
X-Git-Newrev: b43b46429c8492deeed191c539e114840ef6dc2a
Message-Id: <20230429024057.870C33858D37@sourceware.org>
Date: Sat, 29 Apr 2023 02:40:57 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:b43b46429c8492deeed191c539e114840ef6dc2a

commit b43b46429c8492deeed191c539e114840ef6dc2a
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Apr 28 22:40:37 2023 -0400

    Optimize vec_extract of V4SF from memory with constant element numbers.
    
    This patch updates vec_extract of V4SF from memory with constant element
    numbers.
    
    This patch corrects the ISA for loading SF values to altivec registers to be
    power8 vector, and not power7.
    
    This patch adds a combiner pattern to combine loading up a SF element and
    converting it to double.
    
    It also removes the '?' from the 'r' constraint so that if the SFmode is needed
    in a GPR, it doesn't have to load it to the vector unit and then store it.
    
    2023-04-28   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/vsx.md (vsx_extract_v4sf_load): Fix ISA for loading
            up SFmode values with x-form addresses.  Remove ? from 'r' constraint.
            (vsx_extract_v4sf_load_to_df): New insn.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/vec-extract-mem-float-1.c: New file.

Diff:
---
 gcc/config/rs6000/vsx.md                           | 73 +++++++++++++++++++---
 .../gcc.target/powerpc/vec-extract-mem-float-1.c   | 29 +++++++++
 2 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..9d3b3441ed5 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,12 +3549,33 @@
   [(set_attr "length" "8")
    (set_attr "type" "fp")])
 
+;; V4SF extract from memory with constant element number.
+;; Alternatives:
+;;       Reg:  Ele:  Cpu: Addr:                 need scratch
+;;    1: FPR   0     any  normal address        no
+;;    2: FPR   1-3   any  offsettable address   no
+;;    3: FPR   1-3   any  single register       yes
+;;    4: VMX   0     p8   reg+reg or reg        no
+;;    5: VMX   1-3   p8   single register       yes
+;;    6: VMX   0     p9   normal address        no
+;;    7: VMX   1-3   p9   offsettable address   no
+;;    8: GPR   0     any  normal address        no
+;;    9: GPR   0-3   any  offsettable address   no
+;;   10: GPR   0-3   any  single register       yes
 (define_insn_and_split "*vsx_extract_v4sf_load"
-  [(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
+  [(set (match_operand:SF 0 "register_operand"
+		"=f,     f,      f,      v,      v,      v,      v,
+		 r,      r,      r")
 	(vec_select:SF
-	 (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
-	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
-   (clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
+	 (match_operand:V4SF 1 "memory_operand"
+		"m,      o,      Q,      Z,      Q,      m,      o,
+		 m,      o,      Q")
+	 (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+		"O,      n,      n,      O,      n,      O,      n,
+		 O,      n,      n")])))
+   (clobber (match_scratch:P 3
+		 "=X,    X,      &b,     X,      &b,     X,      X,
+		  X,      X,      &b"))]
   "VECTOR_MEM_VSX_P (V4SFmode)"
   "#"
   "&& reload_completed"
@@ -3563,9 +3584,47 @@
   operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
 					   operands[3], SFmode);
 }
-  [(set_attr "type" "fpload,fpload,fpload,load")
-   (set_attr "length" "8")
-   (set_attr "isa" "*,p7v,p9v,*")])
+  [(set_attr "type"
+		"fpload, fpload, fpload, fpload, fpload, fpload, fpload,
+		 load,   load,   load")
+   (set_attr "isa"
+		"*,      *,      *,      p8v,    p8v,    p9v,    p9v,
+		 *,      *,      *")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+;; Alternatives:
+;;       Reg:  Ele:  Cpu: Addr:                 need scratch
+;;    1: FPR   0     any  normal address        no
+;;    2: FPR   1-3   any  offsettable address   no
+;;    3: FPR   1-3   any  single register       yes
+;;    4: VMX   0     p8   reg+reg or reg        no
+;;    5: VMX   1-3   p8   single register       yes
+;;    6: VMX   0     p9   normal address        no
+;;    7: VMX   1-3   p9   offsettable address   no
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+  [(set (match_operand:DF 0 "register_operand"
+		"=f,     f,      f,      v,      v,      v,      v")
+	(float_extend:DF
+	 (vec_select:SF
+	  (match_operand:V4SF 1 "memory_operand"
+		"m,      o,      Q,      Z,      Q,      m,      o")
+	  (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+		"O,      n,      n,      O,      n,      O,      n")]))))
+   (clobber (match_scratch:P 3
+		 "=X,    X,      &b,     X,      &b,     X,      X"))]
+  "VECTOR_MEM_VSX_P (V4SFmode)"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(float_extend:DF (match_dup 4)))]
+{
+  operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+					   operands[3], SFmode);
+}
+  [(set_attr "type"
+		"fpload, fpload, fpload, fpload, fpload, fpload, fpload")
+   (set_attr "isa"
+		"*,      *,      *,      p8v,    p8v,    p9v,    p9v")])
 
 ;; Variable V4SF extract from a register
 (define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+   float elements into a GPR register without doing a LFS/STFS.  */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+  float x = vec_extract (*p, 0);
+  __asm__ (" # %0" : "+r" (x));		/* "+r" forces a GPR: lwz, no lfs/stfs.  */
+  *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+  float x = vec_extract (*p, 1);
+  __asm__ (" # %0" : "+r" (x));		/* "+r" forces a GPR: lwz, no lfs/stfs.  */
+  *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M}               2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M}                 2 } } */
+/* { dg-final { scan-assembler-not   {\mlfsx?\M|\mlxsspx?\M}     } } */
+/* { dg-final { scan-assembler-not   {\mstfsx?\M|\mstxsspx?\M}   } } */