public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-27 21:25 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-27 21:25 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:2059b7d7a1e1f6244ca73e2ba45e5ab1c44ba159
commit 2059b7d7a1e1f6244ca73e2ba45e5ab1c44ba159
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Thu Apr 27 17:24:59 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch changes the splits so that they can be done before register
allocation.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up a SF element and
converting it to double.
2023-04-27 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Allow splitting
before register allocation. Fix ISA for loading up SFmode values to
traditional Altivec registers.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/vsx.md | 26 +++++++++++++++++--
gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c | 29 ++++++++++++++++++++++
2 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..695b5cbd126 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,6 +3549,7 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
(define_insn_and_split "*vsx_extract_v4sf_load"
[(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
(vec_select:SF
@@ -3557,7 +3558,7 @@
(clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
- "&& reload_completed"
+ "&& 1"
[(set (match_dup 0) (match_dup 4))]
{
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
@@ -3565,7 +3566,28 @@
}
[(set_attr "type" "fpload,fpload,fpload,load")
(set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ (set_attr "isa" "*,p8v,p9v,*")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand" "=f,v,v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "memory_operand" "m,Z,m")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))))
+ (clobber (match_scratch:P 3 "=&b,&b,&b"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type" "fpload")
+ (set_attr "length" "8")
+ (set_attr "isa" "*,p8v,p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-29 2:40 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-29 2:40 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:b43b46429c8492deeed191c539e114840ef6dc2a
commit b43b46429c8492deeed191c539e114840ef6dc2a
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Fri Apr 28 22:40:37 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up a SF element and
converting it to double.
It also removes the '?' from the 'r' constraint so that if the SFmode value is needed
in a GPR, it doesn't have to load it to the vector unit and then store it.
2023-04-28 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Fix ISA for loading
up SFmode values with x-form addresses. Remove ? from 'r' constraint.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/vsx.md | 73 +++++++++++++++++++---
gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c | 29 +++++++++
2 files changed, 95 insertions(+), 7 deletions(-)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..9d3b3441ed5 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,12 +3549,33 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
+;; Alternatives:
+;; Reg: Ele: Cpu: Addr: need scratch
+;; 1: FPR 0 any normal address no
+;; 2: FPR 1-3 any offsettable address no
+;; 3: FPR 1-3 any single register yes
+;; 4: VMX 0 p8 reg+reg or reg no
+;; 5: VMX 1-3 p8 single register yes
+;; 6: VMX 0 p9 normal address no
+;; 7: VMX 1-3 p9 offsettable address no
+;; 8: GPR 0 any normal address no
+;; 9: GPR 0-3 any offsettable address no
+;; 10: GPR 0-3 any single register yes
(define_insn_and_split "*vsx_extract_v4sf_load"
- [(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
+ [(set (match_operand:SF 0 "register_operand"
+ "=f, f, f, v, v, v, v,
+ r, r, r")
(vec_select:SF
- (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
- (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
- (clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
+ (match_operand:V4SF 1 "memory_operand"
+ "m, o, Q, Z, Q, m, o,
+ m, o, Q")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+ "O, n, n, O, n, O, n,
+ O, n, n")])))
+ (clobber (match_scratch:P 3
+ "=X, X, &b, X, &b, X, X,
+ X, X, &b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
"&& reload_completed"
@@ -3563,9 +3584,47 @@
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
operands[3], SFmode);
}
- [(set_attr "type" "fpload,fpload,fpload,load")
- (set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ [(set_attr "type"
+ "fpload, fpload, fpload, fpload, fpload, fpload, fpload,
+ load, load, load")
+ (set_attr "isa"
+ "*, *, *, p8v, p8v, p9v, p9v,
+ *, *, *")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+;; Alternatives (operand 2: 'O' = element 0 only, 'n' = any constant element):
+;; Reg: Ele: Cpu: Addr: need scratch
+;; 1: FPR 0 any normal address no
+;; 2: FPR 1-3 any offsettable address no
+;; 3: FPR 1-3 any single register yes
+;; 4: VMX 0 p8 reg+reg or reg no
+;; 5: VMX 1-3 p8 single register yes
+;; 6: VMX 0 p9 normal address no
+;; 7: VMX 1-3 p9 offsettable address no
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand"
+ "=f, f, f, v, v, v, v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "memory_operand"
+ "m, o, Q, Z, Q, m, o")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+ "O, n, n, O, n, O, n")]))))
+ (clobber (match_scratch:P 3
+ "=X, X, &b, X, &b, X, X"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type"
+ "fpload, fpload, fpload, fpload, fpload, fpload, fpload")
+ (set_attr "isa"
+ "*, *, *, p8v, p8v, p9v, p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-29 0:02 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-29 0:02 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:590d55ae10495faf15ffaf122205d095eb3aa440
commit 590d55ae10495faf15ffaf122205d095eb3aa440
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Fri Apr 28 20:01:43 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up a SF element and
converting it to double.
This patch expands the alternatives, so that if the element number is 0 or the
address is offsettable, we don't need a scratch register.
2023-04-28 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Fix ISA for loading
up SFmode values with x-form addresses. Drill down on the alternatives
to prevent allocating a scratch register if we don't need it.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/vsx.md | 73 +++++++++++++++++++---
gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c | 29 +++++++++
2 files changed, 95 insertions(+), 7 deletions(-)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 7121f50a449..ce00e8a1db6 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3555,12 +3555,33 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
+;; Alternatives:
+;; Reg: Ele: Cpu: Addr: need scratch
+;; 1: FPR 0 any normal address no
+;; 2: FPR 1-3 any offsettable address no
+;; 3: FPR 1-3 any single register yes
+;; 4: VMX 0 p8 reg+reg or reg no
+;; 5: VMX 1-3 p8 single register yes
+;; 6: VMX 0 p9 normal address no
+;; 7: VMX 1-3 p9 offsettable address no
+;; 8: GPR 0 any normal address no
+;; 9: GPR 0-3 any offsettable address no
+;; 10: GPR 0-3 any single register yes
(define_insn_and_split "*vsx_extract_v4sf_load"
- [(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
+ [(set (match_operand:SF 0 "register_operand"
+ "=f, f, f, v, v, v, v,
+ ?r, ?r, ?r")
(vec_select:SF
- (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
- (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
- (clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
+ (match_operand:V4SF 1 "memory_operand"
+ "m, o, Q, Z, Q, m, o,
+ m, o, Q")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+ "O, n, n, O, n, O, n,
+ O, n, n")])))
+ (clobber (match_scratch:P 3
+ "=X, X, &b, X, &b, X, X,
+ X, X, &b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
"&& reload_completed"
@@ -3569,9 +3590,47 @@
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
operands[3], SFmode);
}
- [(set_attr "type" "fpload,fpload,fpload,load")
- (set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ [(set_attr "type"
+ "fpload, fpload, fpload, fpload, fpload, fpload, fpload,
+ load, load, load")
+ (set_attr "isa"
+ "*, *, *, p8v, p8v, p9v, p9v,
+ *, *, *")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+;; Alternatives (operand 2: 'O' = element 0 only, 'n' = any constant element):
+;; Reg: Ele: Cpu: Addr: need scratch
+;; 1: FPR 0 any normal address no
+;; 2: FPR 1-3 any offsettable address no
+;; 3: FPR 1-3 any single register yes
+;; 4: VMX 0 p8 reg+reg or reg no
+;; 5: VMX 1-3 p8 single register yes
+;; 6: VMX 0 p9 normal address no
+;; 7: VMX 1-3 p9 offsettable address no
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand"
+ "=f, f, f, v, v, v, v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "memory_operand"
+ "m, o, Q, Z, Q, m, o")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand"
+ "O, n, n, O, n, O, n")]))))
+ (clobber (match_scratch:P 3
+ "=X, X, &b, X, &b, X, X"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type"
+ "fpload, fpload, fpload, fpload, fpload, fpload, fpload")
+ (set_attr "isa"
+ "*, *, *, p8v, p8v, p9v, p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-28 22:12 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-28 22:12 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:51302d4ec98a7f197d350785dfb0ed0fc1ce6dad
commit 51302d4ec98a7f197d350785dfb0ed0fc1ce6dad
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Fri Apr 28 18:09:23 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up a SF element and
converting it to double.
This patch expands the alternatives, so that if the element number is 0 or the
address is offsettable, we don't need a scratch register.
2023-04-28 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Fix ISA for loading
up SFmode values with x-form addresses. Drill down on the alternatives
to prevent allocating a scratch register if we don't need it.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/vsx.md | 53 +++++++++++++++++++---
gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c | 29 ++++++++++++
2 files changed, 75 insertions(+), 7 deletions(-)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..4777c870514 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,12 +3549,22 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
+;; Alternatives:
+;; 1: Load FPR, index 0, normal address, no address change.
+;; 2: Load FPR, index 0-3, offsettable address, element folded into addr.
+;; 3: Load FPR, index 0-3, single register, offset in op[3].
+;; 4: Load VMX, index 0, x-form, power8, no address change.
+;; 5: Load VMX, index 0-3, single register, power8, offset in op[3].
+;; 6: Load VMX, index 0, normal address, power9, no address change.
+;; 7: Load VMX, index 0-3, offsettable address, power9, element in addr.
+;; 8: Load GPR, index 0-3, single register, offset in op[3].
(define_insn_and_split "*vsx_extract_v4sf_load"
- [(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
+ [(set (match_operand:SF 0 "register_operand" "=f,f,f,v,v,v,v,?r")
(vec_select:SF
- (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
- (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
- (clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
+ (match_operand:V4SF 1 "memory_operand" "m,o,Q,Z,Q,m,o,Q")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand" "O,n,n,O,n,O,n,n")])))
+ (clobber (match_scratch:P 3 "=X,X,&b,X,&b,X,X,&b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
"&& reload_completed"
@@ -3563,9 +3573,38 @@
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
operands[3], SFmode);
}
- [(set_attr "type" "fpload,fpload,fpload,load")
- (set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ [(set_attr "type" "fpload,fpload,fpload,fpload,fpload,fpload,fpload,load")
+ (set_attr "length" "4,4,8,4,8,4,4,8")
+ (set_attr "isa" "*,*,*,p8v,p8v,p9v,p9v,*")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+;; Alternatives (a scratch base register is only requested for 3 and 5):
+;; 1: Load FPR, index 0, normal address, no address change.
+;; 2: Load FPR, index 0-3, offsettable address, element folded into addr.
+;; 3: Load FPR, index 0-3, single register, offset in op[3].
+;; 4: Load VMX, index 0, x-form, power8, no address change.
+;; 5: Load VMX, index 0-3, single register, power8, offset in op[3].
+;; 6: Load VMX, index 0, normal address, power9, no address change.
+;; 7: Load VMX, index 0-3, offsettable address, power9, element in addr.
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand" "=f,f,f,v,v,v,v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "memory_operand" "m,o,Q,Z,Q,m,o")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand" "O,n,n,O,n,O,n")]))))
+ (clobber (match_scratch:P 3 "=X,X,&b,X,&b,X,X"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type" "fpload")
+ (set_attr "length" "4,4,8,4,8,4,4")
+ (set_attr "isa" "*,*,*,p8v,p8v,p9v,p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-28 17:57 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-28 17:57 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:984b341d78ddbc4ed3ad90dad7cb607edfa1fd12
commit 984b341d78ddbc4ed3ad90dad7cb607edfa1fd12
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Fri Apr 28 13:57:19 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch changes the splits so that they can be done before register
allocation.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up a SF element and
converting it to double.
In order to do the splitting before register allocation, I modified the various
vec_extract insns to allow the split to occur before register allocation. This
patch goes through the support function rs6000_adjust_vec_address and the
functions it calls to allow them to be called before register allocation. The
places that take a scratch register will allocate a new pseudo register if they
are passed a SCRATCH register.
I also added a new predicate that checks if the operand is a normal memory
address but not an Altivec vector address (i.e. with an AND -16). These
addresses are used in power8 as part of the vector swap optimization. In the
past, because we use the 'Q' constraint, ira/reload would handle the AND
etc. so that the address was only a single register.
2023-04-28 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/predicates.md (non_altivec_memory_operand): New
predicate.
* config/rs6000/rs6000.cc (get_vector_offset): Allow function to be
called before register allocation.
(adjust_vec_address_pcrel): Likewise.
(rs6000_adjust_vec_address): Likewise.
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Allow splitting
before register allocation. Fix ISA for loading up SFmode values to
traditional Altivec registers. Require that the memory being optimized
does not use Altivec memory addresses.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/predicates.md | 10 ++++
gcc/config/rs6000/rs6000.cc | 58 +++++++++++++++-------
gcc/config/rs6000/vsx.md | 28 +++++++++--
gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c | 29 +++++++++++
4 files changed, 104 insertions(+), 21 deletions(-)
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 52c65534e51..3b9265ef1c0 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -957,6 +957,16 @@
return memory_operand (op, mode);
})
+;; Anything that matches memory_operand but does not match
+;; altivec_indexed_or_indirect_operand. This is used by vec_extract memory
+;; optimizations.
+(define_predicate "non_altivec_memory_operand"
+ (match_code "mem")
+{
+ return (memory_operand (op, mode)
+ && !altivec_indexed_or_indirect_operand (op, mode));
+})
+
;; Return 1 if the operand is a MEM with an indexed-form address.
(define_special_predicate "indexed_address_mem"
(match_test "(MEM_P (op)
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 3be5860dd9b..332cb862f54 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -7686,9 +7686,13 @@ get_vector_offset (rtx mem, rtx element, rtx base_tmp, unsigned scalar_size)
if (CONST_INT_P (element))
return GEN_INT (INTVAL (element) * scalar_size);
- /* All insns should use the 'Q' constraint (address is a single register) if
- the element number is not a constant. */
- gcc_assert (satisfies_constraint_Q (mem));
+ if (GET_CODE (base_tmp) == SCRATCH)
+ base_tmp = gen_reg_rtx (Pmode);
+
+ /* After register allocation, all insns should use the 'Q' constraint
+ (address is a single register) if the element number is not a
+ constant. */
+ gcc_assert (can_create_pseudo_p () || satisfies_constraint_Q (mem));
/* Mask the element to make sure the element number is between 0 and the
maximum number of elements - 1 so that we don't generate an address
@@ -7704,6 +7708,9 @@ get_vector_offset (rtx mem, rtx element, rtx base_tmp, unsigned scalar_size)
if (shift > 0)
{
rtx shift_op = gen_rtx_ASHIFT (Pmode, base_tmp, GEN_INT (shift));
+ if (can_create_pseudo_p ())
+ base_tmp = gen_reg_rtx (Pmode);
+
emit_insn (gen_rtx_SET (base_tmp, shift_op));
}
@@ -7747,6 +7754,9 @@ adjust_vec_address_pcrel (rtx addr, rtx element_offset, rtx base_tmp)
else
{
+ if (GET_CODE (base_tmp) == SCRATCH)
+ base_tmp = gen_reg_rtx (Pmode);
+
emit_move_insn (base_tmp, addr);
new_addr = gen_rtx_PLUS (Pmode, base_tmp, element_offset);
}
@@ -7769,9 +7779,8 @@ adjust_vec_address_pcrel (rtx addr, rtx element_offset, rtx base_tmp)
temporary (BASE_TMP) to fixup the address. Return the new memory address
that is valid for reads or writes to a given register (SCALAR_REG).
- This function is expected to be called after reload is completed when we are
- splitting insns. The temporary BASE_TMP might be set multiple times with
- this code. */
+ The temporary BASE_TMP might be set multiple times with this code if this is
+ called after register allocation. */
rtx
rs6000_adjust_vec_address (rtx scalar_reg,
@@ -7784,8 +7793,11 @@ rs6000_adjust_vec_address (rtx scalar_reg,
rtx addr = XEXP (mem, 0);
rtx new_addr;
- gcc_assert (!reg_mentioned_p (base_tmp, addr));
- gcc_assert (!reg_mentioned_p (base_tmp, element));
+ if (GET_CODE (base_tmp) != SCRATCH)
+ {
+ gcc_assert (!reg_mentioned_p (base_tmp, addr));
+ gcc_assert (!reg_mentioned_p (base_tmp, element));
+ }
/* Vector addresses should not have PRE_INC, PRE_DEC, or PRE_MODIFY. */
gcc_assert (GET_RTX_CLASS (GET_CODE (addr)) != RTX_AUTOINC);
@@ -7841,6 +7853,9 @@ rs6000_adjust_vec_address (rtx scalar_reg,
offset, it has the benefit that if D-FORM instructions are
allowed, the offset is part of the memory access to the vector
element. */
+ if (GET_CODE (base_tmp) == SCRATCH)
+ base_tmp = gen_reg_rtx (Pmode);
+
emit_insn (gen_rtx_SET (base_tmp, gen_rtx_PLUS (Pmode, op0, op1)));
new_addr = gen_rtx_PLUS (Pmode, base_tmp, element_offset);
}
@@ -7848,26 +7863,33 @@ rs6000_adjust_vec_address (rtx scalar_reg,
else
{
- emit_move_insn (base_tmp, addr);
+ if (GET_CODE (base_tmp) == SCRATCH)
+ base_tmp = gen_reg_rtx (Pmode);
+
+ emit_insn (gen_rtx_SET (base_tmp, addr));
new_addr = gen_rtx_PLUS (Pmode, base_tmp, element_offset);
}
- /* If the address isn't valid, move the address into the temporary base
- register. Some reasons it could not be valid include:
+ /* If register allocation has been done and the address isn't valid, move
+ the address into the temporary base register. Some reasons it could not
+ be valid include:
The address offset overflowed the 16 or 34 bit offset size;
We need to use a DS-FORM load, and the bottom 2 bits are non-zero;
We need to use a DQ-FORM load, and the bottom 4 bits are non-zero;
Only X_FORM loads can be done, and the address is D_FORM. */
- enum insn_form iform
- = address_to_insn_form (new_addr, scalar_mode,
- reg_to_non_prefixed (scalar_reg, scalar_mode));
-
- if (iform == INSN_FORM_BAD)
+ if (!can_create_pseudo_p ())
{
- emit_move_insn (base_tmp, new_addr);
- new_addr = base_tmp;
+ enum insn_form iform
+ = address_to_insn_form (new_addr, scalar_mode,
+ reg_to_non_prefixed (scalar_reg, scalar_mode));
+
+ if (iform == INSN_FORM_BAD)
+ {
+ emit_move_insn (base_tmp, new_addr);
+ new_addr = base_tmp;
+ }
}
return change_address (mem, scalar_mode, new_addr);
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..ed4636f1e06 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,15 +3549,16 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
(define_insn_and_split "*vsx_extract_v4sf_load"
[(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
(vec_select:SF
- (match_operand:V4SF 1 "memory_operand" "m,Z,m,m")
+ (match_operand:V4SF 1 "non_altivec_memory_operand" "m,Z,m,m")
(parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
(clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
- "&& reload_completed"
+ "&& 1"
[(set (match_dup 0) (match_dup 4))]
{
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
@@ -3565,7 +3566,28 @@
}
[(set_attr "type" "fpload,fpload,fpload,load")
(set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ (set_attr "isa" "*,p8v,p9v,*")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand" "=f,v,v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "non_altivec_memory_operand" "m,Z,m")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))))
+ (clobber (match_scratch:P 3 "=&b,&b,&b"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type" "fpload")
+ (set_attr "length" "8")
+ (set_attr "isa" "*,p8v,p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers.
@ 2023-04-27 20:32 Michael Meissner
0 siblings, 0 replies; 6+ messages in thread
From: Michael Meissner @ 2023-04-27 20:32 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:5fde705eaf4633764f3dea56e8824675edafddac
commit 5fde705eaf4633764f3dea56e8824675edafddac
Author: Michael Meissner <meissner@linux.ibm.com>
Date: Thu Apr 27 16:31:50 2023 -0400
Optimize vec_extract of V4SF from memory with constant element numbers.
This patch updates vec_extract of V4SF from memory with constant element
numbers.
This patch changes the splits so that they can be done before register
allocation.
This patch corrects the ISA for loading SF values to altivec registers to be
power8 vector, and not power7.
This patch adds a combiner pattern to combine loading up an SF element and
converting it to double.
2023-04-27 Michael Meissner <meissner@linux.ibm.com>
gcc/
* config/rs6000/vsx.md (vsx_extract_v4sf_load): Allow splitting
before register allocation. Fix ISA for loading up SFmode values to
traditional Altivec registers.
(vsx_extract_v4sf_load_to_df): New insn.
gcc/testsuite/
* gcc.target/powerpc/vec-extract-mem-float-1.c: New file.
Diff:
---
gcc/config/rs6000/vsx.md | 26 +++++++++++++++++--
.../gcc.target/powerpc/vec-extract-mem-float-1.c | 29 ++++++++++++++++++++++
2 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 417aff5e24b..695b5cbd126 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3549,6 +3549,7 @@
[(set_attr "length" "8")
(set_attr "type" "fp")])
+;; V4SF extract from memory with constant element number.
(define_insn_and_split "*vsx_extract_v4sf_load"
[(set (match_operand:SF 0 "register_operand" "=f,v,v,?r")
(vec_select:SF
@@ -3557,7 +3558,7 @@
(clobber (match_scratch:P 3 "=&b,&b,&b,&b"))]
"VECTOR_MEM_VSX_P (V4SFmode)"
"#"
- "&& reload_completed"
+ "&& 1"
[(set (match_dup 0) (match_dup 4))]
{
operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
@@ -3565,7 +3566,28 @@
}
[(set_attr "type" "fpload,fpload,fpload,load")
(set_attr "length" "8")
- (set_attr "isa" "*,p7v,p9v,*")])
+ (set_attr "isa" "*,p8v,p9v,*")])
+
+;; V4SF extract from memory with constant element number and convert to DFmode.
+(define_insn_and_split "*vsx_extract_v4sf_load_to_df"
+ [(set (match_operand:DF 0 "register_operand" "=f,v,v")
+ (float_extend:DF
+ (vec_select:SF
+ (match_operand:V4SF 1 "memory_operand" "m,Z,m")
+ (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")]))))
+ (clobber (match_scratch:P 3 "=&b,&b,&b"))]
+ "VECTOR_MEM_VSX_P (V4SFmode)"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (float_extend:DF (match_dup 4)))]
+{
+ operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2],
+ operands[3], SFmode);
+}
+ [(set_attr "type" "fpload")
+ (set_attr "length" "8")
+ (set_attr "isa" "*,p8v,p9v")])
;; Variable V4SF extract from a register
(define_insn_and_split "vsx_extract_v4sf_var"
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
new file mode 100644
index 00000000000..4670e261ba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-mem-float-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* Test to verify that the vec_extract with constant element numbers can load
+ float elements into a GPR register without doing a LFS/STFS. */
+
+#include <altivec.h>
+
+void
+extract_v4sf_gpr_0 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 0);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+void
+extract_v4sf_gpr_1 (vector float *p, float *q)
+{
+ float x = vec_extract (*p, 1);
+ __asm__ (" # %0" : "+r" (x)); /* lwz, no lfs/stfs. */
+ *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\mlwzx?\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlfsx?\M|\mlxsspx?\M} } } */
+/* { dg-final { scan-assembler-not {\mstfsx?\M|\mstxsspx?\M} } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-04-29 2:40 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-27 21:25 [gcc(refs/users/meissner/heads/work119)] Optimize vec_extract of V4SF from memory with constant element numbers Michael Meissner
-- strict thread matches above, loose matches on Subject: below --
2023-04-29 2:40 Michael Meissner
2023-04-29 0:02 Michael Meissner
2023-04-28 22:12 Michael Meissner
2023-04-28 17:57 Michael Meissner
2023-04-27 20:32 Michael Meissner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).