* [PATCH] [arm] Implement Armv8.1-M low overhead loops
@ 2020-02-11 10:15 Andrea Corallo
2020-02-11 11:01 ` Richard Earnshaw (lists)
0 siblings, 1 reply; 12+ messages in thread
From: Andrea Corallo @ 2020-02-11 10:15 UTC (permalink / raw)
To: gcc-patches; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 1315 bytes --]
Hi all,
This patch enables the Armv8.1-M Mainline LOB (low overhead branch) extension
low overhead loops (LOL) feature by using the loop-doloop pass
that it shares with the Swing Modulo Scheduler.
bootstrapped arm-none-linux-gnueabihf, does not introduce testsuite regressions.
Andrea
gcc/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
* config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
(arm_invalid_within_doloop): Implement invalid_within_doloop hook.
* config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
* config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
Add new patterns.
* config/arm/unspecs.md: Add new unspec.
gcc/testsuite/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
* gcc.target/arm/lob.h: New header.
* gcc.target/arm/lob1.c: New testcase.
* gcc.target/arm/lob2.c: Likewise.
* gcc.target/arm/lob3.c: Likewise.
* gcc.target/arm/lob4.c: Likewise.
* gcc.target/arm/lob5.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: lol.patch --]
[-- Type: text/x-diff, Size: 12343 bytes --]
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index e07cf03538c..1269f40bd77 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -586,6 +586,9 @@ extern int arm_arch_bf16;
\f
/* Target machine storage Layout. */
+/* Nonzero if this chip provides Armv8.1-M Mainline
+ LOB (low overhead branch features) extension instructions. */
+#define TARGET_HAVE_LOB (arm_arch8_1m_main)
/* Define this macro if it is advisable to hold scalars in registers
in a wider mode than that declared by the program. In such cases,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 9cc7bc0e562..d0b50d544e3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -833,6 +833,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT arm_constant_alignment
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP arm_invalid_within_doloop
+
#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
\f
@@ -32937,6 +32940,39 @@ arm_ge_bits_access (void)
return true;
}
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+arm_invalid_within_doloop (const rtx_insn *insn)
+{
+ if (!TARGET_HAVE_LOB)
+ return default_invalid_within_doloop (insn);
+
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
+ return "Computed branch in the loop.";
+
+ if (INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == PARALLEL)
+ {
+ rtx parallel = PATTERN (insn);
+ rtx clobber;
+ int j;
+ for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
+ {
+ clobber = XVECEXP (parallel, 0, j);
+ if (GET_CODE (clobber) == CLOBBER
+ && GET_CODE (XEXP (clobber, 0)) == REG
+ && REGNO (XEXP (clobber, 0)) == LR_REGNUM)
+ return "LR is used inside loop.";
+ }
+ }
+ return NULL;
+}
+
#if CHECKING_P
namespace selftest {
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index b0d3bd1cf1c..44b1a264dba 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1555,8 +1555,11 @@
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
??? The possible introduction of a new block to initialize the
- new IV can potentially affect branch optimizations. */
- if (optimize > 0 && flag_modulo_sched)
+ new IV can potentially affect branch optimizations.
+
+ Also used to implement the low over head loops feature, which is part of
+ the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
{
rtx s0;
rtx bcomp;
@@ -1569,6 +1572,11 @@
FAIL;
s0 = operands [0];
+
+ /* Low over head loop instructions require the first operand to be LR. */
+ if (TARGET_HAVE_LOB)
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
if (TARGET_THUMB2)
insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
else
@@ -1650,3 +1658,29 @@
"TARGET_HAVE_MVE"
"lsrl%?\\t%Q0, %R0, %1"
[(set_attr "predicable" "yes")])
+
+(define_insn "*doloop_end"
+ [(parallel [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
+ "le\tlr, %l0")
+
+(define_expand "doloop_begin"
+ [(match_operand 0 "" "")
+ (match_operand 1 "" "")]
+ "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
+ {
+ emit_insn (gen_dls_insn (operands[0], operands[0]));
+ DONE;
+ })
+
+(define_insn "dls_insn"
+ [(set (match_operand:SI 0 "" "")
+ (unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] UNSPEC_DLS))]
+ "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
+ "dls\tlr, %1")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43e..df5ecb73192 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -154,6 +154,7 @@
UNSPEC_SMUADX ; Represent the SMUADX operation.
UNSPEC_SSAT16 ; Represent the SSAT16 operation.
UNSPEC_USAT16 ; Represent the USAT16 operation.
+ UNSPEC_DLS ; Used for DLS (Do Loop Start), Armv8.1-M Mainline instruction
])
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
new file mode 100644
index 00000000000..feaae7cc899
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -0,0 +1,15 @@
+#include <string.h>
+
+/* Common code for lob tests. */
+
+#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
+
+#define N 10000
+
+static void
+reset_data (int *a, int *b, int *c)
+{
+ memset (a, -1, N * sizeof (*a));
+ memset (b, -1, N * sizeof (*b));
+ memset (c, -1, N * sizeof (*c));
+}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
new file mode 100644
index 00000000000..8ffaaa29878
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -0,0 +1,82 @@
+/* Check that GCC generates Armv8.1-M low over head loop instructions
+ for some simple loops. */
+/* { dg-do run } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+}
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+{
+ int i = 0;
+ while (i < N)
+ {
+ a[i] = i - 2;
+ b[i] = i * 5;
+ c[i] = a[i] + b[i];
+ i++;
+ }
+}
+
+void __attribute__((noinline))
+loop3 (int *a, int *b, int *c)
+{
+ int i = 0;
+ do
+ {
+ a[i] = i - 4;
+ b[i] = i * 3;
+ c[i] = a[i] + b[i];
+ i++;
+ } while (i < N);
+}
+
+void
+check (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (c[i] != a[i] + b[i])
+ abort ();
+ }
+}
+
+int main (void)
+{
+ reset_data (a, b, c);
+ loop1 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop2 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop3 (a, b ,c);
+ check (a, b ,c);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 3 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 3 } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob2.c b/gcc/testsuite/gcc.target/arm/lob2.c
new file mode 100644
index 00000000000..046d92fcad1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob2.c
@@ -0,0 +1,30 @@
+/* Check that GCC does not generate Armv8.1-M low over head loop instructions
+ if a non-inlineable function call takes place inside the loop. */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int __attribute__ ((noinline))
+foo (int a, int b)
+{
+ return a + b;
+}
+
+int main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo (a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob3.c b/gcc/testsuite/gcc.target/arm/lob3.c
new file mode 100644
index 00000000000..77f89ad9c70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob3.c
@@ -0,0 +1,26 @@
+/* Check that GCC does not generate Armv8.1-M low over head loop instructions
+ if causes VFP emulation library calls to happen inside the loop. */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+double a[N];
+double b[N];
+double c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob4.c b/gcc/testsuite/gcc.target/arm/lob4.c
new file mode 100644
index 00000000000..88be61f3c76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob4.c
@@ -0,0 +1,32 @@
+/* Check that GCC does not generate Armv8.1-M low over head loop instructions
+ if LR is modified within the loop. */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+static __attribute__ ((always_inline)) inline int
+foo (int a, int b)
+{
+ NO_LOB;
+ return a + b;
+}
+
+int main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo(a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob5.c b/gcc/testsuite/gcc.target/arm/lob5.c
new file mode 100644
index 00000000000..cd91c3252d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob5.c
@@ -0,0 +1,33 @@
+/* Check that GCC does not generates Armv8.1-M low over head loop
+ instructions. Innermost loop has no fixed number of iterations
+ therefore is not optimizable. Outer loops are not optimized. */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+
+ int k = b[i];
+ while (k != 0)
+ {
+ if (k % 2 == 0)
+ c[i - 1] = k % 2;
+ k /= 2;
+ }
+ c[i] = a[i] - b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
new file mode 100644
index 00000000000..4bcedc8bd60
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -0,0 +1,94 @@
+/* Check that GCC generates Armv8.1-M low over head loop instructions
+ with some less trivial loops and the result is correct. */
+/* { dg-do run } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+#define TEST_CODE1 \
+ { \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = i; \
+ b[i] = i * 2; \
+ \
+ for (int k = 0; k < N; k++) \
+ { \
+ MAYBE_LOB; \
+ c[k] = k / 2; \
+ } \
+ c[i] = a[i] - b[i]; \
+ } \
+ }
+
+#define TEST_CODE2 \
+ { \
+ for (int i = 0; i < N / 2; i++) \
+ { \
+ MAYBE_LOB; \
+ if (c[i] % 2 == 0) \
+ break; \
+ a[i]++; \
+ b[i]++; \
+ } \
+ }
+
+int a1[N];
+int b1[N];
+int c1[N];
+
+int a2[N];
+int b2[N];
+int c2[N];
+
+#define MAYBE_LOB
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+#undef MAYBE_LOB
+#define MAYBE_LOB NO_LOB
+
+void
+ref1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void
+ref2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+/* Abort unless the arrays written by the loops compiled with low overhead
+   loops allowed (a1, b1, c1) match, element by element, the arrays written
+   by the NO_LOB reference loops (a2, b2, c2).  */
+void
+check (void)
+{
+  for (int i = 0; i < N; i++)
+    {
+      /* Clobber LR so that this checking loop is not itself turned into a
+         low overhead loop (the scan-assembler counts rely on that).  */
+      NO_LOB;
+      /* A mismatch in any single array is a failure, so the comparisons
+         must be joined with '||': with '&&' a wrong result in only one
+         array would go unnoticed.  */
+      if (a1[i] != a2[i]
+          || b1[i] != b2[i]
+          || c1[i] != c2[i])
+        abort ();
+    }
+}
+
+int main (void)
+{
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop1 (a1, b1, c1);
+ ref1 (a2, b2, c2);
+ check ();
+
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop2 (a1, b1, c1);
+ ref2 (a2, b2, c2);
+ check ();
+
+ return 0;
+}
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 2 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 2 } } */
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-11 10:15 [PATCH] [arm] Implement Armv8.1-M low overhead loops Andrea Corallo
@ 2020-02-11 11:01 ` Richard Earnshaw (lists)
2020-02-11 13:40 ` Andrea Corallo
0 siblings, 1 reply; 12+ messages in thread
From: Richard Earnshaw (lists) @ 2020-02-11 11:01 UTC (permalink / raw)
To: Andrea Corallo, gcc-patches; +Cc: nd
On 11/02/2020 10:14, Andrea Corallo wrote:
> Hi all,
>
> This patch enables the Armv8.1-M Mainline LOB (low overhead branch) extension
> low overhead loops (LOL) feature by using the loop-doloop pass
> that it shares with the Swing Modulo Scheduler.
>
> bootstrapped arm-none-linux-gnueabihf, does not introduce testsuite regressions.
>
> Andrea
>
> gcc/ChangeLog:
>
> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
> 2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
> 2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
>
> * config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
> (arm_invalid_within_doloop): Implement invalid_within_doloop hook.
> * config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
> * config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
> Add new patterns.
> * config/arm/unspecs.md: Add new unspec.
>
A date should only appear before the first author in a multi-author
patch, other authors should then be indented to align with the name of
that first author.
> gcc/testsuite/ChangeLog:
>
> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
> 2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
> 2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
>
> * gcc.target/arm/lob.h: New header.
> * gcc.target/arm/lob1.c: New testcase.
> * gcc.target/arm/lob2.c: Likewise.
> * gcc.target/arm/lob3.c: Likewise.
> * gcc.target/arm/lob4.c: Likewise.
> * gcc.target/arm/lob5.c: Likewise.
> * gcc.target/arm/lob6.c: Likewise.
>
+(define_insn "*doloop_end"
+ [(parallel [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
+ "le\tlr, %l0")
Is it deliberate that this pattern name has a '*' prefix? doloop_end is
a named expansion pattern according to md.texi.
Also, hard-coded register names should be prefixed with '%|' (so "%|lr",
not just "lr"), just in case the assembler dialect requires something
(ELF doesn't but others have). Also for dls_insn.
For the tests, your 'require-effective-target' tests look insufficient to
prevent problems when testing a multilib environment, you'll need (at
least) checks that a) passing -marm has not happened and b) that the
architecture, or a specific CPU isn't being passed on the command line.
R.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-11 11:01 ` Richard Earnshaw (lists)
@ 2020-02-11 13:40 ` Andrea Corallo
2020-02-12 9:23 ` Roman Zhuykov
0 siblings, 1 reply; 12+ messages in thread
From: Andrea Corallo @ 2020-02-11 13:40 UTC (permalink / raw)
To: Richard Earnshaw (lists); +Cc: gcc-patches, nd
Hi Richard,
"Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> writes:
>> gcc/ChangeLog:
>> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
>> 2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
>> 2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
>> * config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
>> (arm_invalid_within_doloop): Implement invalid_within_doloop hook.
>> * config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
>> * config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
>> Add new patterns.
>> * config/arm/unspecs.md: Add new unspec.
>>
>
> A date should only appear before the first author in a multi-author
> patch, other authors should then be indented to align with the name of
> that first author.
Ack
> +(define_insn "*doloop_end"
> + [(parallel [(set (pc)
> + (if_then_else
> + (ne (reg:SI LR_REGNUM) (const_int 1))
> + (label_ref (match_operand 0 "" ""))
> + (pc)))
> + (set (reg:SI LR_REGNUM)
> + (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
> + "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
> + "le\tlr, %l0")
>
> Is it deliberate that this pattern name has a '*' prefix? doloop_end
> is a named expansion pattern according to md.texi.
Yes, this should be expanded already by the define_expand we have in
thumb2.md. Perhaps I'll call it 'doloop_end_internal' and add a
comment.
> Also, hard-coded register names should be prefixed with '%|' (so
> "%|lr", not just "lr"), just in case the assembler dialect requires
> something (ELF doesn't but others have). Also for dls_insn.
Ack
> For the tests, your 'require-effective-taret' tests look insufficient
> to prevent problems when testing a multilib environment, you'll need
> (at least) checks that a) passing -marm has not happened and b) that
> the architecture, or a specific CPU isn't being passed on the command
> line.
Ack
Thanks for reviewing; I'll update the patch.
Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-11 13:40 ` Andrea Corallo
@ 2020-02-12 9:23 ` Roman Zhuykov
2020-02-13 17:54 ` Andrea Corallo
2020-02-19 13:01 ` Andrea Corallo
0 siblings, 2 replies; 12+ messages in thread
From: Roman Zhuykov @ 2020-02-12 9:23 UTC (permalink / raw)
To: Andrea Corallo, Richard Earnshaw (lists); +Cc: gcc-patches, nd
Hello!
11.02.2020 16:40, Andrea Corallo wrote:
> Hi Richard,
>
> "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> writes:
>
>>> gcc/ChangeLog:
>>> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
>>> 2020-??-?? Mihail-Calin Ionescu <mihail.ionescu@arm.com>
>>> 2020-??-?? Iain Apreotesei <iain.apreotesei@arm.com>
>>> * config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
>>> (arm_invalid_within_doloop): Implement invalid_within_doloop hook.
>>> * config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
>>> * config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
>>> Add new patterns.
>>> * config/arm/unspecs.md: Add new unspec.
>>>
>> A date should only appear before the first author in a multi-author
>> patch, other authors should then be indented to align with the name of
>> that first author.
> Ack
This patch is stage1 material, right?
>
>> +(define_insn "*doloop_end"
>> + [(parallel [(set (pc)
>> + (if_then_else
>> + (ne (reg:SI LR_REGNUM) (const_int 1))
>> + (label_ref (match_operand 0 "" ""))
>> + (pc)))
>> + (set (reg:SI LR_REGNUM)
>> + (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
>> + "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
>> + "le\tlr, %l0")
I'm not an expert in .md files, but having that "!flag_modulo_sched"
condition seems wrong to me. What was the issue on SMS side to add that?
Currently, there is a fake doloop_end pattern on ARM. It is generated
only when flag_modulo_sched is set and actually expands to more than one
instruction. This old approach has its pros and cons. When we
HAVE_LOB, the target allows us to use a real doloop_end instruction, so the
fake one is not needed at all. In this case the compiler should use the real
instruction regardless of whether SMS is on or off.
I hope that in stage1, after upgrading the modulo scheduler, we will restart
the old discussion about removing the fake doloop_end pattern for ARM:
https://gcc.gnu.org/ml/gcc-patches/2011-07/msg01812.html
https://gcc.gnu.org/ml/gcc-patches/2012-01/msg00195.html
AArch64 has also had such a fake pattern since 2014; its removal will
probably be considered as well.
Roman
>> Is it deliberate that this pattern name has a '*' prefix? doloop_end
>> is a named expansion pattern according to md.texi.
> Yes, this should be expanded already by the define_expand we have in
> thumb2.md. Perhaps I'll call it 'doloop_end_internal' and add a
> comment.
>
>> Also, hard-coded register names should be prefixed with '%|' (so
>> "%|lr", not just "lr"), just in case the assembler dialect requires
>> something (ELF doesn't but others have). Also for dls_insn.
> Ack
>
>> For the tests, your 'require-effective-taret' tests look insufficient
>> to prevent problems when testing a multilib environment, you'll need
>> (at least) checks that a) passing -marm has not happened and b) that
>> the architecture, or a specific CPU isn't being passed on the command
>> line.
> Ack
>
> Thanks for reviewing I'll update the patch.
>
> Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-12 9:23 ` Roman Zhuykov
@ 2020-02-13 17:54 ` Andrea Corallo
2020-02-19 13:01 ` Andrea Corallo
1 sibling, 0 replies; 12+ messages in thread
From: Andrea Corallo @ 2020-02-13 17:54 UTC (permalink / raw)
To: Roman Zhuykov; +Cc: Richard Earnshaw (lists), gcc-patches, nd
Hi Roman,
Roman Zhuykov <zhroma@ispras.ru> writes:
> This patch is stage1 material, right?
Yes
>>
>>> +(define_insn "*doloop_end"
>>> + [(parallel [(set (pc)
>>> + (if_then_else
>>> + (ne (reg:SI LR_REGNUM) (const_int 1))
>>> + (label_ref (match_operand 0 "" ""))
>>> + (pc)))
>>> + (set (reg:SI LR_REGNUM)
>>> + (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
>>> + "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
>>> + "le\tlr, %l0")
> I'm not an expert in .md files, but having that "!flag_modulo_sched"
> condition seems wrong to me. What was the issue on SMS side to add
> that?
With this patch, the first insn of the low overhead loop, 'doloop_begin',
is expanded by 'doloop_modify' in loop-doloop.c. The same does not
happen with SMS. My understanding is that, to have it working in that
case too, the machine-dependent reorg pass should add it later. Am I
correct on this?
Thanks
Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-12 9:23 ` Roman Zhuykov
2020-02-13 17:54 ` Andrea Corallo
@ 2020-02-19 13:01 ` Andrea Corallo
2020-02-21 15:31 ` Kyrill Tkachov
2020-02-21 15:41 ` Roman Zhuykov
1 sibling, 2 replies; 12+ messages in thread
From: Andrea Corallo @ 2020-02-19 13:01 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Earnshaw (lists), Roman Zhuykov, nd
[-- Attachment #1: Type: text/plain, Size: 1836 bytes --]
Hi all,
Second version of the patch here addressing comments.
This patch enables the Armv8.1-M Mainline LOB (low overhead branch) extension
low overhead loops (LOL) feature by using the 'loop-doloop' pass.
Given the following function:
void
loop (int *a)
{
for (int i = 0; i < 1000; i++)
a[i] = i;
}
the 'doloop_begin' and 'doloop_end' patterns translate into 'dls' and 'le',
giving:
loop:
movw r2, #10000
movs r3, #0
subs r0, r0, #4
push {lr}
dls lr, r2
.L2:
str r3, [r0, #4]!
adds r3, r3, #1
le lr, .L2
ldr pc, [sp], #4
SMS is disabled in the tests so that they do not break when SMS does loop versioning.
Bootstrapped on arm-none-linux-gnueabihf; does not introduce testsuite regressions.
Andrea
gcc/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
Mihail-Calin Ionescu <mihail.ionescu@arm.com>
Iain Apreotesei <iain.apreotesei@arm.com>
* config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
(arm_invalid_within_doloop): Implement invalid_within_doloop hook.
* config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
* config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
Add new patterns.
* config/arm/unspecs.md: Add new unspec.
gcc/testsuite/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
Mihail-Calin Ionescu <mihail.ionescu@arm.com>
Iain Apreotesei <iain.apreotesei@arm.com>
* gcc.target/arm/lob.h: New header.
* gcc.target/arm/lob1.c: New testcase.
* gcc.target/arm/lob2.c: Likewise.
* gcc.target/arm/lob3.c: Likewise.
* gcc.target/arm/lob4.c: Likewise.
* gcc.target/arm/lob5.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: lol.patch --]
[-- Type: text/x-diff, Size: 13133 bytes --]
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index e07cf03538c5..1269f40bd77c 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -586,6 +586,9 @@ extern int arm_arch_bf16;
\f
/* Target machine storage Layout. */
+/* Nonzero if this chip provides Armv8.1-M Mainline
+ LOB (low overhead branch features) extension instructions. */
+#define TARGET_HAVE_LOB (arm_arch8_1m_main)
/* Define this macro if it is advisable to hold scalars in registers
in a wider mode than that declared by the program. In such cases,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 9cc7bc0e5621..7c2a7b7e9e97 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -833,6 +833,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT arm_constant_alignment
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP arm_invalid_within_doloop
+
#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
\f
@@ -32937,6 +32940,27 @@ arm_ge_bits_access (void)
return true;
}
+/* NULL if INSN insn is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+arm_invalid_within_doloop (const rtx_insn *insn)
+{
+ if (!TARGET_HAVE_LOB)
+ return default_invalid_within_doloop (insn);
+
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
+ return "Computed branch in the loop.";
+
+ if (reg_mentioned_p (gen_rtx_REG (SImode, LR_REGNUM), insn))
+ return "LR is used inside loop.";
+
+ return NULL;
+}
+
#if CHECKING_P
namespace selftest {
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index b0d3bd1cf1c4..4aff1a0838d8 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1555,8 +1555,11 @@
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
??? The possible introduction of a new block to initialize the
- new IV can potentially affect branch optimizations. */
- if (optimize > 0 && flag_modulo_sched)
+ new IV can potentially affect branch optimizations.
+
+ Also used to implement the low over head loops feature, which is part of
+ the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
{
rtx s0;
rtx bcomp;
@@ -1569,6 +1572,11 @@
FAIL;
s0 = operands [0];
+
+ /* Low over head loop instructions require the first operand to be LR. */
+ if (TARGET_HAVE_LOB)
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
if (TARGET_THUMB2)
insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
else
@@ -1650,3 +1658,30 @@
"TARGET_HAVE_MVE"
"lsrl%?\\t%Q0, %R0, %1"
[(set_attr "predicable" "yes")])
+
+;; Originally expanded by 'doloop_end'.
+(define_insn "doloop_end_internal"
+ [(parallel [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "le\t%|lr, %l0")
+
+(define_expand "doloop_begin"
+ [(match_operand 0 "" "")
+ (match_operand 1 "" "")]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ {
+ emit_insn (gen_dls_insn (operands[0]));
+ DONE;
+ })
+
+(define_insn "dls_insn"
+ [(set (reg:SI LR_REGNUM)
+ (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] UNSPEC_DLS))]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "dls\t%|lr, %0")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43ef..df5ecb731925 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -154,6 +154,7 @@
UNSPEC_SMUADX ; Represent the SMUADX operation.
UNSPEC_SSAT16 ; Represent the SSAT16 operation.
UNSPEC_USAT16 ; Represent the USAT16 operation.
+ UNSPEC_DLS ; Used for DLS (Do Loop Start), Armv8.1-M Mainline instruction
])
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
new file mode 100644
index 000000000000..feaae7cc8995
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -0,0 +1,15 @@
+#include <string.h>
+
+/* Common code for lob tests. */
+
+#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
+
+#define N 10000
+
+static void
+reset_data (int *a, int *b, int *c)
+{
+ memset (a, -1, N * sizeof (*a));
+ memset (b, -1, N * sizeof (*b));
+ memset (c, -1, N * sizeof (*c));
+}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
new file mode 100644
index 000000000000..e4913519942f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -0,0 +1,85 @@
+/* Check that GCC generates Armv8.1-M low over head loop instructions
+ for some simple loops. */
+/* { dg-do run } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+}
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+{
+ int i = 0;
+ while (i < N)
+ {
+ a[i] = i - 2;
+ b[i] = i * 5;
+ c[i] = a[i] + b[i];
+ i++;
+ }
+}
+
+void __attribute__((noinline))
+loop3 (int *a, int *b, int *c)
+{
+ int i = 0;
+ do
+ {
+ a[i] = i - 4;
+ b[i] = i * 3;
+ c[i] = a[i] + b[i];
+ i++;
+ } while (i < N);
+}
+
+void
+check (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (c[i] != a[i] + b[i])
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ reset_data (a, b, c);
+ loop1 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop2 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop3 (a, b ,c);
+ check (a, b ,c);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 3 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 3 } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob2.c b/gcc/testsuite/gcc.target/arm/lob2.c
new file mode 100644
index 000000000000..e81286694804
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob2.c
@@ -0,0 +1,33 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if a non-inlineable function call takes place inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int __attribute__ ((noinline))
+foo (int a, int b)
+{
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo (a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob3.c b/gcc/testsuite/gcc.target/arm/lob3.c
new file mode 100644
index 000000000000..69d22b2f023a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob3.c
@@ -0,0 +1,28 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+   if VFP emulation library calls happen inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+double a[N];
+double b[N];
+double c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob4.c b/gcc/testsuite/gcc.target/arm/lob4.c
new file mode 100644
index 000000000000..62be52e31007
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob4.c
@@ -0,0 +1,35 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if LR is modified within the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+static __attribute__ ((always_inline)) inline int
+foo (int a, int b)
+{
+ NO_LOB;
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo(a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob5.c b/gcc/testsuite/gcc.target/arm/lob5.c
new file mode 100644
index 000000000000..ad8a1b961e40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob5.c
@@ -0,0 +1,36 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop
+   instructions. The innermost loop has no fixed number of iterations and
+   is therefore not optimizable. Outer loops are not optimized. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+
+ int k = b[i];
+ while (k != 0)
+ {
+ if (k % 2 == 0)
+ c[i - 1] = k % 2;
+ k /= 2;
+ }
+ c[i] = a[i] - b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
new file mode 100644
index 000000000000..1dbcaff1670d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -0,0 +1,97 @@
+/* Check that GCC generates Armv8.1-M low overhead loop instructions
+ with some less trivial loops and the result is correct. */
+/* { dg-do run } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+#define TEST_CODE1 \
+ { \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = i; \
+ b[i] = i * 2; \
+ \
+ for (int k = 0; k < N; k++) \
+ { \
+ MAYBE_LOB; \
+ c[k] = k / 2; \
+ } \
+ c[i] = a[i] - b[i]; \
+ } \
+ }
+
+#define TEST_CODE2 \
+ { \
+ for (int i = 0; i < N / 2; i++) \
+ { \
+ MAYBE_LOB; \
+ if (c[i] % 2 == 0) \
+ break; \
+ a[i]++; \
+ b[i]++; \
+ } \
+ }
+
+int a1[N];
+int b1[N];
+int c1[N];
+
+int a2[N];
+int b2[N];
+int c2[N];
+
+#define MAYBE_LOB
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+#undef MAYBE_LOB
+#define MAYBE_LOB NO_LOB
+
+void
+ref1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void
+ref2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+void
+check (void)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB; /* Clobber LR so this checking loop is not itself a LOB loop.  */
+      if (a1[i] != a2[i] /* A mismatch in any one array is a failure.  */
+          || b1[i] != b2[i]
+          || c1[i] != c2[i])
+        abort ();
+    }
+}
+
+int
+main (void)
+{
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop1 (a1, b1, c1);
+ ref1 (a2, b2, c2);
+ check ();
+
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop2 (a1, b1, c1);
+ ref2 (a2, b2, c2);
+ check ();
+
+ return 0;
+}
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 2 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 2 } } */
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-19 13:01 ` Andrea Corallo
@ 2020-02-21 15:31 ` Kyrill Tkachov
2020-02-21 15:49 ` Roman Zhuykov
2020-02-21 15:41 ` Roman Zhuykov
1 sibling, 1 reply; 12+ messages in thread
From: Kyrill Tkachov @ 2020-02-21 15:31 UTC (permalink / raw)
To: Andrea Corallo, gcc-patches; +Cc: Richard Earnshaw, Roman Zhuykov, nd
Hi Andrea,
On 2/19/20 1:01 PM, Andrea Corallo wrote:
> Hi all,
>
> Second version of the patch here addressing comments.
>
> This patch enables the Armv8.1-M Mainline LOB (low overhead branch)
> extension
> low overhead loops (LOL) feature by using the 'loop-doloop' pass.
>
> Given the following function:
>
> void
> loop (int *a)
> {
>   for (int i = 0; i < 1000; i++)
>     a[i] = i;
> }
>
> 'doloop_begin' and 'doloop_end' patterns translates into 'dls' and 'le'
> giving:
>
>  loop:
>         movw    r2, #10000
>         movs    r3, #0
>         subs    r0, r0, #4
>         push    {lr}
>         dls     lr, r2
>  .L2:
>         str     r3, [r0, #4]!
>         adds    r3, r3, #1
>         le      lr, .L2
>         ldr     pc, [sp], #4
>
> SMS is disabled in tests not to break them when SMS does loop versioning.
>
> bootstrapped arm-none-linux-gnueabihf, do not introduce testsuite
> regressions.
This should be aimed at GCC 11 at this point.
Some comments inline...
>
> Andrea
>
> gcc/ChangeLog:
>
> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
>            Mihail-Calin Ionescu <mihail.ionescu@arm.com>
>            Iain Apreotesei <iain.apreotesei@arm.com>
>
>        * config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
>        (arm_invalid_within_doloop): Implement invalid_within_doloop hook.
>        * config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
>        * config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
>        Add new patterns.
>        * config/arm/unspecs.md: Add new unspec.
>
> gcc/testsuite/ChangeLog:
>
> 2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
>            Mihail-Calin Ionescu <mihail.ionescu@arm.com>
>            Iain Apreotesei <iain.apreotesei@arm.com>
>
>        * gcc.target/arm/lob.h: New header.
>        * gcc.target/arm/lob1.c: New testcase.
>        * gcc.target/arm/lob2.c: Likewise.
>        * gcc.target/arm/lob3.c: Likewise.
>        * gcc.target/arm/lob4.c: Likewise.
>        * gcc.target/arm/lob5.c: Likewise.
>        * gcc.target/arm/lob6.c: Likewise.
>
lol.patch
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index e07cf03538c5..1269f40bd77c 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -586,6 +586,9 @@ extern int arm_arch_bf16;
/* Target machine storage Layout. */
+/* Nonzero if this chip provides Armv8.1-M Mainline
+ LOB (low overhead branch features) extension instructions. */
+#define TARGET_HAVE_LOB (arm_arch8_1m_main)
/* Define this macro if it is advisable to hold scalars in registers
in a wider mode than that declared by the program. In such cases,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 9cc7bc0e5621..7c2a7b7e9e97 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -833,6 +833,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT arm_constant_alignment
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP arm_invalid_within_doloop
+
#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
@@ -32937,6 +32940,27 @@ arm_ge_bits_access (void)
return true;
}
+/* Implement TARGET_INVALID_WITHIN_DOLOOP: return NULL if INSN is valid within
+   a low-overhead loop, otherwise a string saying why doloop cannot be used.  */
+
+static const char *
+arm_invalid_within_doloop (const rtx_insn *insn)
+{
+ if (!TARGET_HAVE_LOB)
+ return default_invalid_within_doloop (insn);
+
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
+ return "Computed branch in the loop.";
+
+ if (reg_mentioned_p (gen_rtx_REG (SImode, LR_REGNUM), insn))
+ return "LR is used inside loop.";
+
+ return NULL;
+}
+
#if CHECKING_P
namespace selftest {
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index b0d3bd1cf1c4..4aff1a0838d8 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1555,8 +1555,11 @@
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
??? The possible introduction of a new block to initialize the
- new IV can potentially affect branch optimizations. */
- if (optimize > 0 && flag_modulo_sched)
+ new IV can potentially affect branch optimizations.
+
+ Also used to implement the low overhead loops feature, which is part of
+ the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
{
rtx s0;
rtx bcomp;
@@ -1569,6 +1572,11 @@
FAIL;
s0 = operands [0];
+
+ /* Low overhead loop instructions require the first operand to be LR. */
+ if (TARGET_HAVE_LOB)
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
if (TARGET_THUMB2)
insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
else
@@ -1650,3 +1658,30 @@
"TARGET_HAVE_MVE"
"lsrl%?\\t%Q0, %R0, %1"
[(set_attr "predicable" "yes")])
+
+;; Originally expanded by 'doloop_end'.
+(define_insn "doloop_end_internal"
We usually prefer to name these patterns with a '*' in front to prevent the gen* machinery from generating unneeded gen_* expanders for them if they're not used.
+ [(parallel [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "le\t%|lr, %l0")
+
+(define_expand "doloop_begin"
+ [(match_operand 0 "" "")
+ (match_operand 1 "" "")]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ {
+ emit_insn (gen_dls_insn (operands[0]));
+ DONE;
+ })
+
+(define_insn "dls_insn"
+ [(set (reg:SI LR_REGNUM)
+ (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] UNSPEC_DLS))]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "dls\t%|lr, %0")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43ef..df5ecb731925 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -154,6 +154,7 @@
UNSPEC_SMUADX ; Represent the SMUADX operation.
UNSPEC_SSAT16 ; Represent the SSAT16 operation.
UNSPEC_USAT16 ; Represent the USAT16 operation.
+ UNSPEC_DLS ; Used for DLS (Do Loop Start), Armv8.1-M Mainline instruction
])
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
new file mode 100644
index 000000000000..feaae7cc8995
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -0,0 +1,15 @@
+#include <string.h>
+
+/* Common code for lob tests. */
+
+#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
+
+#define N 10000
+
+static void
+reset_data (int *a, int *b, int *c)
+{
+ memset (a, -1, N * sizeof (*a));
+ memset (b, -1, N * sizeof (*b));
+ memset (c, -1, N * sizeof (*c));
+}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
new file mode 100644
index 000000000000..e4913519942f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -0,0 +1,85 @@
+/* Check that GCC generates Armv8.1-M low overhead loop instructions
+ for some simple loops. */
+/* { dg-do run } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
We need to avoid running this test on targets that don't support LOB. This needs an appropriate effective target check (see the existing *_hw ones in lib/target-supports.exp)
Thanks,
Kyrill
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+}
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+{
+ int i = 0;
+ while (i < N)
+ {
+ a[i] = i - 2;
+ b[i] = i * 5;
+ c[i] = a[i] + b[i];
+ i++;
+ }
+}
+
+void __attribute__((noinline))
+loop3 (int *a, int *b, int *c)
+{
+ int i = 0;
+ do
+ {
+ a[i] = i - 4;
+ b[i] = i * 3;
+ c[i] = a[i] + b[i];
+ i++;
+ } while (i < N);
+}
+
+void
+check (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (c[i] != a[i] + b[i])
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ reset_data (a, b, c);
+ loop1 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop2 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop3 (a, b ,c);
+ check (a, b ,c);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 3 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 3 } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob2.c b/gcc/testsuite/gcc.target/arm/lob2.c
new file mode 100644
index 000000000000..e81286694804
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob2.c
@@ -0,0 +1,33 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if a non-inlineable function call takes place inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int __attribute__ ((noinline))
+foo (int a, int b)
+{
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo (a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob3.c b/gcc/testsuite/gcc.target/arm/lob3.c
new file mode 100644
index 000000000000..69d22b2f023a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob3.c
@@ -0,0 +1,28 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+   if VFP emulation library calls happen inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+double a[N];
+double b[N];
+double c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob4.c b/gcc/testsuite/gcc.target/arm/lob4.c
new file mode 100644
index 000000000000..62be52e31007
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob4.c
@@ -0,0 +1,35 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if LR is modified within the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+static __attribute__ ((always_inline)) inline int
+foo (int a, int b)
+{
+ NO_LOB;
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo(a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob5.c b/gcc/testsuite/gcc.target/arm/lob5.c
new file mode 100644
index 000000000000..ad8a1b961e40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob5.c
@@ -0,0 +1,36 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop
+   instructions. The innermost loop has no fixed number of iterations and
+   is therefore not optimizable. Outer loops are not optimized. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+
+ int k = b[i];
+ while (k != 0)
+ {
+ if (k % 2 == 0)
+ c[i - 1] = k % 2;
+ k /= 2;
+ }
+ c[i] = a[i] - b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
new file mode 100644
index 000000000000..1dbcaff1670d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -0,0 +1,97 @@
+/* Check that GCC generates Armv8.1-M low overhead loop instructions
+ with some less trivial loops and the result is correct. */
+/* { dg-do run } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-skip-if "do not run SMS to prevent loop versioning" { *-*-* } { "-fmodulo-sched" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+#define TEST_CODE1 \
+ { \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = i; \
+ b[i] = i * 2; \
+ \
+ for (int k = 0; k < N; k++) \
+ { \
+ MAYBE_LOB; \
+ c[k] = k / 2; \
+ } \
+ c[i] = a[i] - b[i]; \
+ } \
+ }
+
+#define TEST_CODE2 \
+ { \
+ for (int i = 0; i < N / 2; i++) \
+ { \
+ MAYBE_LOB; \
+ if (c[i] % 2 == 0) \
+ break; \
+ a[i]++; \
+ b[i]++; \
+ } \
+ }
+
+int a1[N];
+int b1[N];
+int c1[N];
+
+int a2[N];
+int b2[N];
+int c2[N];
+
+#define MAYBE_LOB
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+#undef MAYBE_LOB
+#define MAYBE_LOB NO_LOB
+
+void
+ref1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void
+ref2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+void
+check (void)
+{
+  for (int i = 0; i < N; i++)
+    {
+      NO_LOB; /* Clobber LR so this checking loop is not itself a LOB loop.  */
+      if (a1[i] != a2[i] /* A mismatch in any one array is a failure.  */
+          || b1[i] != b2[i]
+          || c1[i] != c2[i])
+        abort ();
+    }
+}
+
+int
+main (void)
+{
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop1 (a1, b1, c1);
+ ref1 (a2, b2, c2);
+ check ();
+
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop2 (a1, b1, c1);
+ ref2 (a2, b2, c2);
+ check ();
+
+ return 0;
+}
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 2 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 2 } } */
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-19 13:01 ` Andrea Corallo
2020-02-21 15:31 ` Kyrill Tkachov
@ 2020-02-21 15:41 ` Roman Zhuykov
2020-02-24 15:48 ` Andrea Corallo
1 sibling, 1 reply; 12+ messages in thread
From: Roman Zhuykov @ 2020-02-21 15:41 UTC (permalink / raw)
To: Andrea Corallo, gcc-patches; +Cc: Richard Earnshaw (lists), nd
Andrea Corallo writes:
> With this patch the first insn of the low loop overhead 'doloop_begin'
> is expanded by 'doloop_modify' in loop-doloop.c. The same does not
> happen with SMS.
That certainly works correctly, since in your first patch the doloop_begin
pattern also has the "!flag_modulo_sched" condition.
> My understanding is that to have it working in that
> case too the machine dependent reorg pass should add it later. Am I
> correct on this?
IMHO, this is not needed in your case. Currently, the list of platforms
(actually, gcc/config subfolders) which have doloop_end is rather big:
aarch64*, arc, arm*, bfin, c6x, ia64, pdp11, pru, rs6000, s390, sh,
tilegx*, tilepro, v850 and xtensa. I marked three of them with a star -
they actually have a fake pattern, which is applied only with SMS.
Reorg_loops from hw-doloop.c (see also
https://gcc.gnu.org/ml/gcc-patches/2011-06/msg01593.html and
https://gcc.gnu.org/ml/gcc-patches/2011-07/msg00133.html) is used only
in arc, bfin, c6x, and xtensa. Certainly some other platforms may have
additional loop reorg steps in the target-specific part (e.g. pru), but not
all of them. And that reorg is actually needed independently of whether
SMS is on or off.
Actually, the question was: what goes wrong if you remove that
"!flag_modulo_sched" condition from the three new patterns? I had actually
gone one step further: I removed those "!flag_modulo_sched" parts from your
patch and ran the following very simple test of the modified patch.
I built two ARM cross-compilers and compared their regtest results:
the first was built from clean trunk, the second with the patch. Both compilers
were configured -with-march=armv8.1-m.main and had a modified common.opt
to enable -fmodulo-sched and -fmodulo-sched-allow-regmoves by default.
Regtest results are identical.
> Second version of the patch here addressing comments.
Thank you, now I see in second patch that aspect was solved.
> SMS is disabled in tests not to break them when SMS does loop versioning.
And I'm not really sure about this. First of all, there are a lot of
scan-assembler-times tests which fail when the modulo scheduler is enabled;
probably the same happens when some unrolling parameters are not
default. It seems that any non-default optimization which creates more
instruction copies can break a scan-assembler-times check. IMHO, it is
not necessary to work around this in a few particular tests. Second, I'm
not sure how the dg-skip-if directive works. When one enables SMS by setting
"Init(1)" directly in common.opt, this won't be caught, will it?
Roman
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-21 15:31 ` Kyrill Tkachov
@ 2020-02-21 15:49 ` Roman Zhuykov
2020-02-21 15:53 ` Kyrill Tkachov
0 siblings, 1 reply; 12+ messages in thread
From: Roman Zhuykov @ 2020-02-21 15:49 UTC (permalink / raw)
To: Kyrill Tkachov, Andrea Corallo, gcc-patches; +Cc: Richard Earnshaw, nd
11.02.2020 14:00, Richard Earnshaw (lists) wrote:
> +(define_insn "*doloop_end"
> +  [(parallel [(set (pc)
> +                   (if_then_else
> +                       (ne (reg:SI LR_REGNUM) (const_int 1))
> +                     (label_ref (match_operand 0 "" ""))
> +                     (pc)))
> +              (set (reg:SI LR_REGNUM)
> +                   (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
> +  "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
> +  "le\tlr, %l0")
>
> Is it deliberate that this pattern name has a '*' prefix? doloop_end
> is a named expansion pattern according to md.texi.
>
> R.
21.02.2020 18:30, Kyrill Tkachov wrote:
> +;; Originally expanded by 'doloop_end'.
> +(define_insn "doloop_end_internal"
>
> We usually prefer to name these patterns with a '*' in front to
> prevent the gen* machinery from generating gen_* unneeded expanders
> for them if they're not used.
>
It seems you and Richard are asking Andrea to do opposite things.
:) LOL.patch
Roman
PS. I don't have an idea what approach is correct.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-21 15:49 ` Roman Zhuykov
@ 2020-02-21 15:53 ` Kyrill Tkachov
0 siblings, 0 replies; 12+ messages in thread
From: Kyrill Tkachov @ 2020-02-21 15:53 UTC (permalink / raw)
To: Roman Zhuykov, Andrea Corallo, gcc-patches; +Cc: Richard Earnshaw, nd
Hi Roman,
On 2/21/20 3:49 PM, Roman Zhuykov wrote:
> 11.02.2020 14:00, Richard Earnshaw (lists) wrote:
>> +(define_insn "*doloop_end"
>> +  [(parallel [(set (pc)
>> +                   (if_then_else
>> +                       (ne (reg:SI LR_REGNUM) (const_int 1))
>> +                     (label_ref (match_operand 0 "" ""))
>> +                     (pc)))
>> +              (set (reg:SI LR_REGNUM)
>> +                   (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
>> +  "TARGET_32BIT && TARGET_HAVE_LOB && !flag_modulo_sched"
>> +  "le\tlr, %l0")
>>
>> Is it deliberate that this pattern name has a '*' prefix? doloop_end
>> is a named expansion pattern according to md.texi.
>>
>> R.
> 21.02.2020 18:30, Kyrill Tkachov wrote:
>> +;; Originally expanded by 'doloop_end'.
>> +(define_insn "doloop_end_internal"
>>
>> We usually prefer to name these patterns with a '*' in front to
>> prevent the gen* machinery from generating gen_* unneeded expanders
>> for them if they're not used.
>>
> It seems you and Richard asking Andrea to do the opposite things.
> :) LOL.patch
Almost, but not exactly incompatible things ;)
doloop_end is a standard name and if we wanted to use it directly it
cannot have a '*', which Richard is right to point out.
Once "doloop_end" is moved to its own expander and the define_insn is
doloop_end_internal, there is no reason for it to not have a '*' as its
gen_* form is never called.
Thanks,
Kyrill
> Roman
>
> PS. I don't have an idea what approach is correct.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-21 15:41 ` Roman Zhuykov
@ 2020-02-24 15:48 ` Andrea Corallo
2020-02-25 14:57 ` Andrea Corallo
0 siblings, 1 reply; 12+ messages in thread
From: Andrea Corallo @ 2020-02-24 15:48 UTC (permalink / raw)
To: Roman Zhuykov; +Cc: gcc-patches, Richard Earnshaw (lists), nd
Hi Roman,
Roman Zhuykov <zhroma@ispras.ru> writes:
>> SMS is disabled in tests not to break them when SMS does loop versioning.
>
> And I'm not really sure about this. First of all, there are a lot of
> scan-assembler-times tests which fail when modulo-scheduler is enabled,
> probably the same happens when some unrolling parameters are not
> default. It seems that any non-default optimization which creates more
> instruction copies can break scan-assembler-times check. IMHO, it is
> not necessary to workaround this in few particular tests. Second, I'm
> not sure how dg-skip-if directive works. When one enables SMS setting
> "Init(1)" directly into common.opt this won't be catched, would it?
Agree on everything. Also enabling SMS from common.opt apparently makes
dg-skip-if not effective.
Thanks for commenting; I'm updating the patch.
Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] [arm] Implement Armv8.1-M low overhead loops
2020-02-24 15:48 ` Andrea Corallo
@ 2020-02-25 14:57 ` Andrea Corallo
0 siblings, 0 replies; 12+ messages in thread
From: Andrea Corallo @ 2020-02-25 14:57 UTC (permalink / raw)
To: gcc-patches; +Cc: Roman Zhuykov, Richard Earnshaw (lists), Kyrill Tkachov, nd
[-- Attachment #1: Type: text/plain, Size: 1941 bytes --]
Hi all,
Third version of the patch here addressing comments.
This patch enables the Armv8.1-M Mainline LOB (low overhead branch) extension
low overhead loops (LOL) feature by using the 'loop-doloop' pass.
Given the following function:
void
loop (int *a)
{
for (int i = 0; i < 10000; i++)
a[i] = i;
}
'doloop_begin' and 'doloop_end' patterns translate into 'dls' and 'le'
giving:
loop:
movw r2, #10000
movs r3, #0
subs r0, r0, #4
push {lr}
dls lr, r2
.L2:
str r3, [r0, #4]!
adds r3, r3, #1
le lr, .L2
ldr pc, [sp], #4
Bootstrapped on arm-none-linux-gnueabihf; does not introduce testsuite regressions.
Andrea
gcc/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
Mihail-Calin Ionescu <mihail.ionescu@arm.com>
Iain Apreotesei <iain.apreotesei@arm.com>
* config/arm/arm.c (TARGET_INVALID_WITHIN_DOLOOP):
(arm_invalid_within_doloop): Implement invalid_within_doloop hook.
* config/arm/arm.h (TARGET_HAVE_LOB): Add new macro.
* config/arm/thumb2.md (*doloop_end, doloop_begin, dls_insn):
Add new patterns.
* config/arm/unspecs.md: Add new unspec.
* doc/sourcebuild.texi (arm_v8_1_lob_ok): Document new target
supports option.
gcc/testsuite/ChangeLog:
2020-??-?? Andrea Corallo <andrea.corallo@arm.com>
Mihail-Calin Ionescu <mihail.ionescu@arm.com>
Iain Apreotesei <iain.apreotesei@arm.com>
* gcc.target/arm/lob.h: New header.
* gcc.target/arm/lob1.c: New testcase.
* gcc.target/arm/lob2.c: Likewise.
* gcc.target/arm/lob3.c: Likewise.
* gcc.target/arm/lob4.c: Likewise.
* gcc.target/arm/lob5.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
* lib/target-supports.exp (check_effective_target_arm_v8_1_lob_ok):
New proc.
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: lol.patch --]
[-- Type: text/x-diff, Size: 14270 bytes --]
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index e07cf03538c5..1269f40bd77c 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -586,6 +586,9 @@ extern int arm_arch_bf16;
\f
/* Target machine storage Layout. */
+/* Nonzero if this chip provides Armv8.1-M Mainline
+ LOB (low overhead branch features) extension instructions. */
+#define TARGET_HAVE_LOB (arm_arch8_1m_main)
/* Define this macro if it is advisable to hold scalars in registers
in a wider mode than that declared by the program. In such cases,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 9cc7bc0e5621..7c2a7b7e9e97 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -833,6 +833,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT arm_constant_alignment
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP arm_invalid_within_doloop
+
#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
\f
@@ -32937,6 +32940,27 @@ arm_ge_bits_access (void)
return true;
}
+/* Return NULL if INSN is valid within a low-overhead loop.
+ Otherwise return why doloop cannot be applied. */
+
+static const char *
+arm_invalid_within_doloop (const rtx_insn *insn)
+{
+ if (!TARGET_HAVE_LOB)
+ return default_invalid_within_doloop (insn);
+
+ if (CALL_P (insn))
+ return "Function call in the loop.";
+
+ if (tablejump_p (insn, NULL, NULL) || computed_jump_p (insn))
+ return "Computed branch in the loop.";
+
+ if (reg_mentioned_p (gen_rtx_REG (SImode, LR_REGNUM), insn))
+ return "LR is used inside loop.";
+
+ return NULL;
+}
+
#if CHECKING_P
namespace selftest {
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index b0d3bd1cf1c4..fab993a8079f 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1555,8 +1555,11 @@
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
??? The possible introduction of a new block to initialize the
- new IV can potentially affect branch optimizations. */
- if (optimize > 0 && flag_modulo_sched)
+ new IV can potentially affect branch optimizations.
+
+ Also used to implement the low overhead loops feature, which is part of
+ the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
{
rtx s0;
rtx bcomp;
@@ -1569,6 +1572,11 @@
FAIL;
s0 = operands [0];
+
+ /* Low overhead loop instructions require the first operand to be LR. */
+ if (TARGET_HAVE_LOB)
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
if (TARGET_THUMB2)
insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
else
@@ -1650,3 +1658,30 @@
"TARGET_HAVE_MVE"
"lsrl%?\\t%Q0, %R0, %1"
[(set_attr "predicable" "yes")])
+
+;; Originally expanded by 'doloop_end'.
+(define_insn "*doloop_end_internal"
+ [(parallel [(set (pc)
+ (if_then_else
+ (ne (reg:SI LR_REGNUM) (const_int 1))
+ (label_ref (match_operand 0 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int -1)))])]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "le\t%|lr, %l0")
+
+(define_expand "doloop_begin"
+ [(match_operand 0 "" "")
+ (match_operand 1 "" "")]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ {
+ emit_insn (gen_dls_insn (operands[0]));
+ DONE;
+ })
+
+(define_insn "dls_insn"
+ [(set (reg:SI LR_REGNUM)
+ (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")] UNSPEC_DLS))]
+ "TARGET_32BIT && TARGET_HAVE_LOB"
+ "dls\t%|lr, %0")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 8f4a705f43ef..df5ecb731925 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -154,6 +154,7 @@
UNSPEC_SMUADX ; Represent the SMUADX operation.
UNSPEC_SSAT16 ; Represent the SSAT16 operation.
UNSPEC_USAT16 ; Represent the USAT16 operation.
+ UNSPEC_DLS ; Used for DLS (Do Loop Start), Armv8.1-M Mainline instruction
])
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 1f412ded2bb7..5b9e0399ebc1 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1945,6 +1945,12 @@ ARM Target supports options suitable for accessing the Q-bit manipulation
intrinsics from @code{arm_acle.h}.
Some multilibs may be incompatible with these options.
+@item arm_v8_1_lob_ok
+@anchor{arm_v8_1_lob_ok}
+ARM Target supports executing the Armv8.1-M Mainline Low Overhead Loop
+instructions @code{DLS} and @code{LE}.
+Some multilibs may be incompatible with these options.
+
@end table
@subsubsection AArch64-specific attributes
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
new file mode 100644
index 000000000000..feaae7cc8995
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -0,0 +1,15 @@
+#include <string.h>
+
+/* Common code for lob tests. */
+
+#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
+
+#define N 10000
+
+static void
+reset_data (int *a, int *b, int *c)
+{
+ memset (a, -1, N * sizeof (*a));
+ memset (b, -1, N * sizeof (*b));
+ memset (c, -1, N * sizeof (*c));
+}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
new file mode 100644
index 000000000000..b92dc551d50b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -0,0 +1,85 @@
+/* Check that GCC generates Armv8.1-M low overhead loop instructions
+ for some simple loops. */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_1_lob_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+}
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+{
+ int i = 0;
+ while (i < N)
+ {
+ a[i] = i - 2;
+ b[i] = i * 5;
+ c[i] = a[i] + b[i];
+ i++;
+ }
+}
+
+void __attribute__((noinline))
+loop3 (int *a, int *b, int *c)
+{
+ int i = 0;
+ do
+ {
+ a[i] = i - 4;
+ b[i] = i * 3;
+ c[i] = a[i] + b[i];
+ i++;
+ } while (i < N);
+}
+
+void
+check (int *a, int *b, int *c)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (c[i] != a[i] + b[i])
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ reset_data (a, b, c);
+ loop1 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop2 (a, b ,c);
+ check (a, b ,c);
+ reset_data (a, b, c);
+ loop3 (a, b ,c);
+ check (a, b ,c);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 3 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 3 } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob2.c b/gcc/testsuite/gcc.target/arm/lob2.c
new file mode 100644
index 000000000000..1fe9a9d82bb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob2.c
@@ -0,0 +1,32 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if a non-inlineable function call takes place inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int __attribute__ ((noinline))
+foo (int a, int b)
+{
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo (a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob3.c b/gcc/testsuite/gcc.target/arm/lob3.c
new file mode 100644
index 000000000000..17cba007ccb6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob3.c
@@ -0,0 +1,27 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if VFP emulation library calls happen inside the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+double a[N];
+double b[N];
+double c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = a[i] + b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob4.c b/gcc/testsuite/gcc.target/arm/lob4.c
new file mode 100644
index 000000000000..444a2c7b4bfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob4.c
@@ -0,0 +1,34 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop instructions
+ if LR is modified within the loop. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps -mfloat-abi=soft" } */
+/* { dg-require-effective-target arm_softfloat } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+static __attribute__ ((always_inline)) inline int
+foo (int a, int b)
+{
+ NO_LOB;
+ return a + b;
+}
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+ c[i] = foo(a[i], b[i]);
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob5.c b/gcc/testsuite/gcc.target/arm/lob5.c
new file mode 100644
index 000000000000..c4f46e41532b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob5.c
@@ -0,0 +1,35 @@
+/* Check that GCC does not generate Armv8.1-M low overhead loop
+ instructions. The innermost loop has no fixed number of iterations and
+ is therefore not optimizable. Outer loops are not optimized. */
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+int a[N];
+int b[N];
+int c[N];
+
+int
+main (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ a[i] = i;
+ b[i] = i * 2;
+
+ int k = b[i];
+ while (k != 0)
+ {
+ if (k % 2 == 0)
+ c[i - 1] = k % 2;
+ k /= 2;
+ }
+ c[i] = a[i] - b[i];
+ }
+
+ return 0;
+}
+/* { dg-final { scan-assembler-not {dls\s\S*,\s\S*} } } */
+/* { dg-final { scan-assembler-not {le\slr,\s\S*} } } */
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
new file mode 100644
index 000000000000..95db2f47efe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -0,0 +1,97 @@
+/* Check that GCC generates Armv8.1-M low overhead loop instructions
+ with some less trivial loops and the result is correct. */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_1_lob_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main -O3 --save-temps" } */
+#include <stdlib.h>
+#include "lob.h"
+
+#define TEST_CODE1 \
+ { \
+ for (int i = 0; i < N; i++) \
+ { \
+ a[i] = i; \
+ b[i] = i * 2; \
+ \
+ for (int k = 0; k < N; k++) \
+ { \
+ MAYBE_LOB; \
+ c[k] = k / 2; \
+ } \
+ c[i] = a[i] - b[i]; \
+ } \
+ }
+
+#define TEST_CODE2 \
+ { \
+ for (int i = 0; i < N / 2; i++) \
+ { \
+ MAYBE_LOB; \
+ if (c[i] % 2 == 0) \
+ break; \
+ a[i]++; \
+ b[i]++; \
+ } \
+ }
+
+int a1[N];
+int b1[N];
+int c1[N];
+
+int a2[N];
+int b2[N];
+int c2[N];
+
+#define MAYBE_LOB
+void __attribute__((noinline))
+loop1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void __attribute__((noinline))
+loop2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+#undef MAYBE_LOB
+#define MAYBE_LOB NO_LOB
+
+void
+ref1 (int *a, int *b, int *c)
+ TEST_CODE1;
+
+void
+ref2 (int *a, int *b, int *c)
+ TEST_CODE2;
+
+void
+check (void)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (a1[i] != a2[i]
+ && b1[i] != b2[i]
+ && c1[i] != c2[i])
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop1 (a1, b1, c1);
+ ref1 (a2, b2, c2);
+ check ();
+
+ reset_data (a1, b1, c1);
+ reset_data (a2, b2, c2);
+ loop2 (a1, b1, c1);
+ ref2 (a2, b2, c2);
+ check ();
+
+ return 0;
+}
+/* { dg-final { scan-assembler-times {dls\s\S*,\s\S*} 2 } } */
+/* { dg-final { scan-assembler-times {le\slr,\s\S*} 2 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index d3b2798df3e8..51669aac1327 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9807,6 +9807,28 @@ proc check_effective_target_arm_v8_3a_bkey_directive { } {
}]
}
+# Return 1 if the target supports executing the Armv8.1-M Mainline Low
+# Overhead Loop instructions, 0 otherwise. The test is valid for ARM.
+
+proc check_effective_target_arm_v8_1_lob_ok { } {
+ if { ![istarget arm*-*-*] } {
+ return 0;
+ } else {
+ return [check_runtime arm_v8_1_lob_hw_available {
+ int
+ main (void)
+ { int i = 0;
+ asm ("movw r3, #10\n\t" /* movs? */
+ "dls lr, r3" : : : "r3", "lr");
+ loop:
+ i++;
+ asm goto ("le lr, %l0" : : : "lr" : loop);
+ return i != 10;
+ }
+ } "-march=armv8.1-m.main" ]
+ }
+}
+
# Returns 1 if the target is using glibc, 0 otherwise.
proc check_effective_target_glibc { } {
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2020-02-25 14:57 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-11 10:15 [PATCH] [arm] Implement Armv8.1-M low overhead loops Andrea Corallo
2020-02-11 11:01 ` Richard Earnshaw (lists)
2020-02-11 13:40 ` Andrea Corallo
2020-02-12 9:23 ` Roman Zhuykov
2020-02-13 17:54 ` Andrea Corallo
2020-02-19 13:01 ` Andrea Corallo
2020-02-21 15:31 ` Kyrill Tkachov
2020-02-21 15:49 ` Roman Zhuykov
2020-02-21 15:53 ` Kyrill Tkachov
2020-02-21 15:41 ` Roman Zhuykov
2020-02-24 15:48 ` Andrea Corallo
2020-02-25 14:57 ` Andrea Corallo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).