public inbox for gcc-patches@gcc.gnu.org
* [AVR PATCH] Optimize (X>>C)&1 for C in [1,4,8,16,24] in *insv.any_shift.<mode>.
@ 2023-11-02 11:50 Roger Sayle
  2023-11-09 18:08 ` Georg-Johann Lay
  2024-03-05 11:15 ` [patch,avr,applied] Improve output of insn "*insv.any_shift.<mode>" Georg-Johann Lay
  0 siblings, 2 replies; 3+ messages in thread
From: Roger Sayle @ 2023-11-02 11:50 UTC (permalink / raw)
  To: gcc-patches; +Cc: 'Denis Chertykov', 'Georg-Johann Lay'

[-- Attachment #1: Type: text/plain, Size: 2431 bytes --]


This patch optimizes a few special cases in avr.md's *insv.any_shift.<mode>
instruction.  This pattern handles tests of a single bit, where the result
has only a single (possibly different) bit set.  Currently this always
requires a three-instruction sequence of a BST, a CLR and a BLD (plus any
additional CLR instructions to clear the rest of the result bytes).
The special cases considered here are those that can be done with only two
instructions (plus CLRs): an ANDI preceded by either a MOV, a SHIFT or a
SWAP.
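
In C terms, the HImode C=1 case corresponds to a function of the following
shape (unsigned short is 16 bits, i.e. HImode, on AVR; this is the same
function as in the new insvhi-2.c test below):

    unsigned short foo (unsigned short x)
    {
      return (x >> 1) & 1;
    }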

Hence for C=1 in HImode, GCC with -O2 currently generates:

        bst r24,1
        clr r24
        clr r25
        bld r24,0

with this patch, we now generate:

        lsr r24
        andi r24,1
        clr r25

Likewise, HImode C=4 now becomes:

        swap r24
        andi r24,1
        clr r25

and SImode C=8 now becomes:

        mov r22,r23
        andi r22,1
        clr r23
        clr r24
        clr r25


I've not attempted to model the instruction length accurately for these
special cases; the logic would be ugly, but it's safe to use the current
(1 insn longer) length.
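
For example, in the SImode C=8 case above the emitted sequence is five
instructions (MOV, ANDI and three CLRs), while the pattern's length
attribute still claims 2 + 4 = 6 words on devices without MOVW; an
over-estimate of one word is harmless, whereas an under-estimate would
not be safe.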

This patch has been (partially) tested with a cross-compiler to avr-elf
hosted on x86_64, without a simulator, where the compile-only tests in
the gcc testsuite show no regressions.  If someone could test this more
thoroughly that would be great.


2023-11-02  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/avr/avr.md (*insv.any_shift.<mode>): Optimize special
        cases of *insv.any_shift that save one instruction by using
        ANDI with either a MOV, a SHIFT or a SWAP.

gcc/testsuite/ChangeLog
        * gcc.target/avr/insvhi-1.c: New HImode test case.
        * gcc.target/avr/insvhi-2.c: Likewise.
        * gcc.target/avr/insvhi-3.c: Likewise.
        * gcc.target/avr/insvhi-4.c: Likewise.
        * gcc.target/avr/insvhi-5.c: Likewise.
        * gcc.target/avr/insvqi-1.c: New QImode test case.
        * gcc.target/avr/insvqi-2.c: Likewise.
        * gcc.target/avr/insvqi-3.c: Likewise.
        * gcc.target/avr/insvqi-4.c: Likewise.
        * gcc.target/avr/insvsi-1.c: New SImode test case.
        * gcc.target/avr/insvsi-2.c: Likewise.
        * gcc.target/avr/insvsi-3.c: Likewise.
        * gcc.target/avr/insvsi-4.c: Likewise.
        * gcc.target/avr/insvsi-5.c: Likewise.
        * gcc.target/avr/insvsi-6.c: Likewise.


Thanks in advance,
Roger
--


[-- Attachment #2: patchav.txt --]
[-- Type: text/plain, Size: 15354 bytes --]

diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 83dd15040b07..c2a1931733f8 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -9840,6 +9840,7 @@
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
+    int ldi_ok = test_hard_reg_class (LD_REGS, operands[0]);
     int shift = <CODE> == ASHIFT ? INTVAL (operands[2]) : -INTVAL (operands[2]);
     int mask = GET_MODE_MASK (<MODE>mode) & INTVAL (operands[3]);
     // Position of the output / input bit, respectively.
@@ -9850,6 +9851,217 @@
     operands[3] = GEN_INT (obit);
     operands[2] = GEN_INT (ibit);
 
+    /* Special cases requiring MOV to low byte and ANDI.  */
+    if ((shift & 7) == 0 && ldi_ok)
+      {
+	if (IN_RANGE (obit, 0, 7))
+	  {
+	    if (shift == -8)
+	      {
+		if (<SIZE> == 2)
+		  return "mov %A0,%B1\;andi %A0,lo8(1<<%3)\;clr %B0";
+		if (<SIZE> == 3)
+		  return "mov %A0,%B1\;andi %A0,lo8(1<<%3)\;clr %B0\;clr %C0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %A0,%B1\;andi %A0,lo8(1<<%3)\;"
+			 "clr %B0\;clr %C0\;clr %D0";
+	      }
+	    else if (shift == -16)
+	      {
+		if (<SIZE> == 3)
+		  return "mov %A0,%C1\;andi %A0,lo8(1<<%3)\;clr %B0\;clr %C0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %A0,%C1\;andi %A0,lo8(1<<%3)\;"
+			 "clr %B0\;clr %C0\;clr %D0";
+	      }
+	    else if (shift == -24 && !AVR_HAVE_MOVW)
+	      return "mov %A0,%D1\;andi %A0,lo8(1<<%3)\;"
+		     "clr %B0\;clr %C0\;clr %D0";
+	  }
+
+	/* Special cases requiring MOV and ANDI.  */
+	else if (IN_RANGE (obit, 8, 15))
+	  {
+	    if (shift == 8)
+	      {
+		if (<SIZE> == 2)
+		  return "mov %B0,%A1\;andi %B0,lo8(1<<(%3-8))\;clr %A0";
+		if (<SIZE> == 3)
+		  return "mov %B0,%A1\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %B0,%A1\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0\;clr %D0";
+	      }
+	    else if (shift == -8)
+	      {
+		if (<SIZE> == 3)
+		  return "mov %B0,%C1\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %B0,%C1\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0\;clr %D0";
+	      }
+	    else if (shift == -16 && !AVR_HAVE_MOVW)
+	      return "mov %B0,%D1\;andi %B0,lo8(1<<(%3-8))\;"
+		     "clr %A0\;clr %C0\;clr %D0";
+	  }
+	else if (IN_RANGE (obit, 16, 23))
+	  {
+	    if (shift == 16)
+	      {
+		if (<SIZE> == 3)
+		  return "mov %C0,%A1\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %C0,%A1\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0\;clr %D0";
+	      }
+	    else if (shift == 8)
+	      {
+		if (<SIZE> == 3)
+		  return "mov %C0,%B1\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0";
+		if (<SIZE> == 4 && !AVR_HAVE_MOVW)
+		  return "mov %C0,%B1\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0\;clr %D0";
+	      }
+	    else if (shift == -8 && !AVR_HAVE_MOVW)
+	      return "mov %C0,%D1\;andi %C0,lo8(1<<(%3-16))\;"
+		     "clr %A0\;clr %B0\;clr %D0";
+	  }
+	else if (IN_RANGE (obit, 24, 31) && !AVR_HAVE_MOVW)
+	  {
+	    if (shift == 8)
+	      return "mov %D0,%C1\;andi %D0,lo8(1<<(%3-24))\;"
+		     "clr %A0\;clr %B0\;clr %C0";
+	    if (shift == 16)
+	      return "mov %D0,%B1\;andi %D0,lo8(1<<(%3-24))\;"
+		     "clr %A0\;clr %B0\;clr %C0";
+	    if (shift == 24)
+	      return "mov %D0,%A1\;andi %D0,lo8(1<<(%3-24))\;"
+		     "clr %A0\;clr %B0\;clr %C0";
+	  }
+      }
+
+    /* Special cases where the byte is already in place.  */
+    if (REGNO (operands[0]) == REGNO (operands[1])
+	&& ldi_ok)
+      {
+	if (shift == 1)
+	  {
+	    if (IN_RANGE (obit, 0, 7))
+	      {
+		if (<SIZE> == 1)
+		  return "lsl %0\;andi %0,lo8(1<<%3)";
+		if (<SIZE> == 2)
+		  return "lsl %A0\;andi %A0,lo8(1<<%3)\;clr %B0";
+		if (<SIZE> == 3)
+		  return "lsl %A0\;andi %A0,lo8(1<<%3)\;clr %B0\;clr %C0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsl %A0\;andi %A0,lo8(1<<%3)\;"
+			 "clr %B0\;clr %C0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 9, 15))
+	      {
+		if (<SIZE> == 2)
+		  return "lsl %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0";
+		if (<SIZE> == 3)
+		  return "lsl %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0\;clr %C0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsl %B0\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 17, 23))
+	      {
+		if (<SIZE> == 3)
+		  return "lsl %C0\;andi %C0,lo8(1<<(%3-16))\;clr %A0\;clr %B0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsl %C0\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 25, 31) && !AVR_HAVE_MOVW)
+	      return "lsl %D0\;andi %D0,lo8(1<<(%3-24))\;"
+		     "clr %A0\;clr %B0\;clr %C0";
+	  }
+
+	if (shift == -1)
+	  {
+	    if (IN_RANGE (obit, 0, 6))
+	      {
+		if (<SIZE> == 1)
+		  return "lsr %0\;andi %0,lo8(1<<%3)";
+		if (<SIZE> == 2)
+		  return "lsr %A0\;andi %A0,lo8(1<<%3)\;clr %B0";
+		if (<SIZE> == 3)
+		  return "lsr %A0\;andi %A0,lo8(1<<%3)\;clr %B0\;clr %C0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsr %A0\;andi %A0,lo8(1<<%3)\;"
+			 "clr %B0\;clr %C0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 8, 14))
+	      {
+		if (<SIZE> == 2)
+		  return "lsr %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0";
+		if (<SIZE> == 3)
+		  return "lsr %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0\;clr %C0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsr %B0\;andi %B0,lo8(1<<(%3-8))\;"
+			 "clr %A0\;clr %C0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 16, 22))
+	      {
+		if (<SIZE> == 3)
+		  return "lsr %C0\;andi %C0,lo8(1<<(%3-16))\;clr %A0\;clr %B0";
+		if (!AVR_HAVE_MOVW)
+		  return "lsr %C0\;andi %C0,lo8(1<<(%3-16))\;"
+			 "clr %A0\;clr %B0\;clr %D0";
+	      }
+	    else if (IN_RANGE (obit, 24, 30) && !AVR_HAVE_MOVW)
+	      return "lsr %D0\;andi %D0,lo8(1<<(%3-24))\;"
+		     "clr %A0\;clr %B0\;clr %C0";
+	  }
+
+	if ((shift == 4 && IN_RANGE (obit, 4, 7))
+	    || (shift == -4 && IN_RANGE (obit, 0, 3)))
+	  {
+	    if (<SIZE> == 1)
+	      return "swap %0\;andi %0,lo8(1<<%3)";
+	    if (<SIZE> == 2)
+	      return "swap %A0\;andi %A0,lo8(1<<%3)\;clr %B0";
+	    if (<SIZE> == 3)
+	      return "swap %A0\;andi %A0,lo8(1<<%3)\;clr %B0\;clr %C0";
+	    if (!AVR_HAVE_MOVW)
+	      return "swap %A0\;andi %A0,lo8(1<<%3)\;"
+		     "clr %B0\;clr %C0\;clr %D0";
+	  }
+	if ((shift == 4 && IN_RANGE (obit, 12, 15))
+	    || (shift == -4 && IN_RANGE (obit, 8, 11)))
+	  {
+	    if (<SIZE> == 2)
+	      return "swap %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0";
+	    if (<SIZE> == 3)
+	      return "swap %B0\;andi %B0,lo8(1<<(%3-8))\;clr %A0\;clr %C0";
+	    if (!AVR_HAVE_MOVW)
+	      return "swap %B0\;andi %B0,lo8(1<<(%3-8))\;"
+		     "clr %A0\;clr %C0\;clr %D0";
+	  }
+	if ((shift == 4 && IN_RANGE (obit, 20, 23))
+	    || (shift == -4 && IN_RANGE (obit, 16, 19)))
+	  {
+	    if (<SIZE> == 3)
+	      return "swap %C0\;andi %C0,lo8(1<<(%3-16))\;clr %A0\;clr %B0";
+	    if (!AVR_HAVE_MOVW)
+	      return "swap %C0\;andi %C0,lo8(1<<(%3-16))\;"
+		     "clr %A0\;clr %B0\;clr %D0";
+	  }
+	if (((shift == 4 && IN_RANGE (obit, 28, 31))
+	     || (shift == -4 && IN_RANGE (obit, 24, 27)))
+	    && !AVR_HAVE_MOVW)
+	  return "swap %D0\;andi %D0,lo8(1<<(%3-24))\;"
+		 "clr %A0\;clr %B0\;clr %C0";
+      }
+
     if (<SIZE> == 1) return "bst %T1%T2\;clr %0\;"                 "bld %T0%T3";
     if (<SIZE> == 2) return "bst %T1%T2\;clr %A0\;clr %B0\;"       "bld %T0%T3";
     if (<SIZE> == 3) return "bst %T1%T2\;clr %A0\;clr %B0\;clr %C0\;bld %T0%T3";
diff --git a/gcc/testsuite/gcc.target/avr/insvhi-1.c b/gcc/testsuite/gcc.target/avr/insvhi-1.c
new file mode 100644
index 000000000000..4468917b7696
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvhi-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned short foo(unsigned short x)
+{
+  return x & 1;
+}
+
+/* { dg-final { scan-assembler "andi r24,1" } } */
+/* { dg-final { scan-assembler "clr r25" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvhi-2.c b/gcc/testsuite/gcc.target/avr/insvhi-2.c
new file mode 100644
index 000000000000..6899fbd0b994
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvhi-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned short foo(unsigned short x)
+{
+  return (x >> 1) & 1;
+}
+
+/* { dg-final { scan-assembler "lsr r24" } } */
+/* { dg-final { scan-assembler "andi r24,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler "clr r25" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvhi-3.c b/gcc/testsuite/gcc.target/avr/insvhi-3.c
new file mode 100644
index 000000000000..2d1d9f413dc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvhi-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned short foo(unsigned short x)
+{
+  return (x >> 2) & 1;
+}
+
+/* { dg-final { scan-assembler "bst r24,2" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 2 } } */
+/* { dg-final { scan-assembler "bld r24,0" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvhi-4.c b/gcc/testsuite/gcc.target/avr/insvhi-4.c
new file mode 100644
index 000000000000..6a36f4ca6112
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvhi-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned short foo(unsigned short x)
+{
+  return (x >> 4) & 1;
+}
+
+/* { dg-final { scan-assembler "swap r24" } } */
+/* { dg-final { scan-assembler "andi r24,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler "clr r25" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvhi-5.c b/gcc/testsuite/gcc.target/avr/insvhi-5.c
new file mode 100644
index 000000000000..6a6e8684a2b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvhi-5.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned short foo(unsigned short x)
+{
+  return (x >> 8) & 1;
+}
+
+/* { dg-final { scan-assembler "mov r24,r25" } } */
+/* { dg-final { scan-assembler "andi r24,1" } } */
+/* { dg-final { scan-assembler "ldi r25,0" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvqi-1.c b/gcc/testsuite/gcc.target/avr/insvqi-1.c
new file mode 100644
index 000000000000..32f53f226d81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvqi-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned char foo(unsigned char x)
+{
+  return x & 1;
+}
+
+/* { dg-final { scan-assembler "andi r24,lo8\\(1\\)" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvqi-2.c b/gcc/testsuite/gcc.target/avr/insvqi-2.c
new file mode 100644
index 000000000000..b276b2b3b92f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvqi-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned char foo(unsigned char x)
+{
+  return (x>>1) & 1;
+}
+
+/* { dg-final { scan-assembler "lsr r24" } } */
+/* { dg-final { scan-assembler "andi r24,1" } } */
+
diff --git a/gcc/testsuite/gcc.target/avr/insvqi-3.c b/gcc/testsuite/gcc.target/avr/insvqi-3.c
new file mode 100644
index 000000000000..c28320f64c81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvqi-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned char foo(unsigned char x)
+{
+  return (x>>2) & 1;
+}
+
+/* { dg-final { scan-assembler "bst r24,2" } } */
+/* { dg-final { scan-assembler "clr r24" } } */
+/* { dg-final { scan-assembler "bld r24,0" } } */
+
diff --git a/gcc/testsuite/gcc.target/avr/insvqi-4.c b/gcc/testsuite/gcc.target/avr/insvqi-4.c
new file mode 100644
index 000000000000..1ae7afe92a5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvqi-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned char foo(unsigned char x)
+{
+  return (x>>4) & 1;
+}
+
+/* { dg-final { scan-assembler "swap r24" } } */
+/* { dg-final { scan-assembler "andi r24,1" } } */
+
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-1.c b/gcc/testsuite/gcc.target/avr/insvsi-1.c
new file mode 100644
index 000000000000..e057a7a09183
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return x & 1;
+}
+
+/* { dg-final { scan-assembler "andi r22,1" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 3 } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-2.c b/gcc/testsuite/gcc.target/avr/insvsi-2.c
new file mode 100644
index 000000000000..518340322ce3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return (x >> 1) & 1;
+}
+
+/* { dg-final { scan-assembler "lsr r22" } } */
+/* { dg-final { scan-assembler "andi r22,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 3 } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-3.c b/gcc/testsuite/gcc.target/avr/insvsi-3.c
new file mode 100644
index 000000000000..c8d6e1a8ea23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return (x >> 2) & 1;
+}
+
+/* { dg-final { scan-assembler "bst r22,2" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 4 } } */
+/* { dg-final { scan-assembler "bld r22,0" } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-4.c b/gcc/testsuite/gcc.target/avr/insvsi-4.c
new file mode 100644
index 000000000000..52a3a75203e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return (x >> 4) & 1;
+}
+
+/* { dg-final { scan-assembler "swap r22" } } */
+/* { dg-final { scan-assembler "andi r22,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 3 } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-5.c b/gcc/testsuite/gcc.target/avr/insvsi-5.c
new file mode 100644
index 000000000000..627016a5ae66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-5.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return (x >> 8) & 1;
+}
+
+/* { dg-final { scan-assembler "mov r22,r23" } } */
+/* { dg-final { scan-assembler "andi r22,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 3 } } */
diff --git a/gcc/testsuite/gcc.target/avr/insvsi-6.c b/gcc/testsuite/gcc.target/avr/insvsi-6.c
new file mode 100644
index 000000000000..6f72d6a74232
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/insvsi-6.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+unsigned long foo(unsigned long x)
+{
+  return (x >> 16) & 1;
+}
+
+/* { dg-final { scan-assembler "mov r22,r24" } } */
+/* { dg-final { scan-assembler "andi r22,lo8\\(1<<0\\)" } } */
+/* { dg-final { scan-assembler-times "clr r2\\d" 3 } } */


* Re: [AVR PATCH] Optimize (X>>C)&1 for C in [1,4,8,16,24] in *insv.any_shift.<mode>.
  2023-11-02 11:50 [AVR PATCH] Optimize (X>>C)&1 for C in [1,4,8,16,24] in *insv.any_shift.<mode> Roger Sayle
@ 2023-11-09 18:08 ` Georg-Johann Lay
  2024-03-05 11:15 ` [patch,avr,applied] Improve output of insn "*insv.any_shift.<mode>" Georg-Johann Lay
  1 sibling, 0 replies; 3+ messages in thread
From: Georg-Johann Lay @ 2023-11-09 18:08 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches; +Cc: 'Denis Chertykov', Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 3095 bytes --]



On 02.11.23 at 12:50, Roger Sayle wrote:
> 
> This patch optimizes a few special cases in avr.md's *insv.any_shift.<mode>
> instruction.  This template handles tests for a single bit, where the result
> has only a (different) single bit set in the result.  Usually (currently)
> this always requires a three-instruction sequence of a BST, a CLR and a BLD
> (plus any additional CLR instructions to clear the rest of the result
> bytes).
> The special cases considered here are those that can be done with only two
> instructions (plus CLRs); an ANDI preceded by either a MOV, a SHIFT or a
> SWAP.
> 
> Hence for C=1 in HImode, GCC with -O2 currently generates:
> 
>          bst r24,1
>          clr r24
>          clr r25
>          bld r24,0
> 
> with this patch, we now generate:
> 
>          lsr r24
>          andi r24,1
>          clr r25
> 
> Likewise, HImode C=4 now becomes:
> 
>          swap r24
>          andi r24,1
>          clr r25
> 
> and SImode C=8 now becomes:
> 
>          mov r22,r23
>          andi r22,1
>          clr r23
>          clr r24
>          clr r25
> 
> 
> I've not attempted to model the instruction length accurately for these
> special cases; the logic would be ugly, but it's safe to use the current
> (1 insn longer) length.
> 
> This patch has been (partially) tested with a cross-compiler to avr-elf
> hosted on x86_64, without a simulator, where the compile-only tests in
> the gcc testsuite show no regressions.  If someone could test this more
> thoroughly that would be great.
> 
> 
> 2023-11-02  Roger Sayle  <roger@nextmovesoftware.com>

CCing Andrew.

Hi, here is a version based on yours.

I am still unsure what to make of this insn; one approach would be
a post-reload split, which simplifies the pattern a bit.  However, where
the current pattern would use MOVW, a split version would need one
more instruction, because there would be no MOVW but two MOVs.

Splitting would improve the situation when not all of the output bytes
are used by the following code, though.

Maybe Andrew has an idea; he helped a lot to improve code generation
by fixing and tweaking the middle-end using AVR test cases, for example
for PR55181 or PR109907.

Anyway, here is a version that works out exact code lengths, and it
handles some more cases.

Then I am not really sure whether test cases that assert specific
instruction sequences from the optimizers are a good idea or rather a
liability: the middle-end is not very good at generating reproducible
code across versions.  In particular, it's not uncommon that newer GCC
versions no longer find some optimizations.  So the attached patch just
uses dg-do run without asserting anything about the exact code sequence.

Johann

--

Improve insn output for "*insv.any_shift.<mode>".

gcc/
	* config/avr/avr-protos.h (avr_out_insv): New proto.
	* config/avr/avr.md (adjust_len) [insv]: Add to define_attr.
	(*insv.any_shift.<mode>): Output using...
	* config/avr/avr.cc (avr_out_insv): ...this new function.
	(avr_adjust_insn_length) [ADJUST_LEN_INSV]: Handle new case.

gcc/testsuite/
	* gcc.target/avr/torture/insv-anyshift.c: New test.

[-- Attachment #2: x.diff --]
[-- Type: text/x-patch, Size: 11172 bytes --]

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 5c1343f0df8..dfc949a8c0f 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -58,6 +58,7 @@ extern const char *ret_cond_branch (rtx x, int len, int reverse);
 extern const char *avr_out_movpsi (rtx_insn *, rtx*, int*);
 extern const char *avr_out_sign_extend (rtx_insn *, rtx*, int*);
 extern const char *avr_out_insert_notbit (rtx_insn *, rtx*, int*);
+extern const char *avr_out_insv (rtx_insn *, rtx*, int*);
 extern const char *avr_out_extr (rtx_insn *, rtx*, int*);
 extern const char *avr_out_extr_not (rtx_insn *, rtx*, int*);
 extern const char *avr_out_plus_set_ZN (rtx*, int*);
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index 5e0217de36f..b4d082315b5 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -9066,6 +9066,159 @@ avr_out_insert_notbit (rtx_insn *insn, rtx op[], int *plen)
 }
 
 
+/* Output instructions for  XOP[0] = (XOP[1] <Shift> XOP[2]) & XOP[3]  where
+   * XOP[0] and XOP[1] have the same mode which is one of: QI, HI, PSI, SI.
+   * XOP[3] is an exact power of 2.
+   * XOP[2] and XOP[3] are const_int.
+   * <Shift> is any of: ASHIFT, LSHIFTRT, ASHIFTRT.
+   * The result depends on XOP[1].
+   Returns "".
+   PLEN != 0: Set *PLEN to the code length in words.  Don't output anything.
+   PLEN == 0: Output instructions.  */
+
+const char*
+avr_out_insv (rtx_insn *insn, rtx xop[], int *plen)
+{
+  machine_mode mode = GET_MODE (xop[0]);
+  int n_bytes = GET_MODE_SIZE (mode);
+  rtx xsrc = SET_SRC (single_set (insn));
+
+  // Any of ASHIFT, LSHIFTRT, ASHIFTRT.
+  enum rtx_code code = GET_CODE (XEXP (xsrc, 0));
+  int shift = code == ASHIFT ? INTVAL (xop[2]) : -INTVAL (xop[2]);
+
+  // Determines the position of the output bit.
+  unsigned mask = GET_MODE_MASK (mode) & INTVAL (xop[3]);
+
+  // Position of the output / input bit, respectively.
+  int obit = exact_log2 (mask);
+  int ibit = obit - shift;
+
+  gcc_assert (IN_RANGE (obit, 0, GET_MODE_BITSIZE (mode) - 1));
+  gcc_assert (IN_RANGE (ibit, 0, GET_MODE_BITSIZE (mode) - 1));
+
+  // In the remainder, use the sub-bytes that hold the bits.
+  rtx op[4] =
+    {
+      // Output
+      simplify_gen_subreg (QImode, xop[0], mode, obit / 8),
+      GEN_INT (obit & 7),
+      // Input
+      simplify_gen_subreg (QImode, xop[1], mode, ibit / 8),
+      GEN_INT (ibit & 7)
+    };
+  obit &= 7;
+  ibit &= 7;
+
+  // The length of the default sequence at the end of this function.
+  // We only emit anything other than the default when we find a sequence
+  // that is strictly shorter than the default sequence.
+  const int len0 = 2 + n_bytes - (n_bytes == 4 && AVR_HAVE_MOVW);
+
+  // Finding something shorter than the default sequence implies that there
+  // must be at most 2 instructions that deal with the bytes containing the
+  // relevant bits.  In addition, we need  N_BYTES - 1  instructions to clear
+  // the remaining result bytes.
+
+  const int n_clr = n_bytes - 1;
+  bool clr_p = false;
+  bool andi_p = false;
+
+  if (plen)
+    *plen = 0;
+
+  if (REGNO (op[0]) == REGNO (op[2])
+      // Output reg allows ANDI.
+      && test_hard_reg_class (LD_REGS, op[0]))
+    {
+      if (1 + n_clr < len0
+	  // Same byte and bit: A single ANDI will do.
+	  && obit == ibit)
+	{
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // |obit - ibit| = 4:  SWAP + ANDI will do.
+	       && (obit == ibit + 4 || obit == ibit - 4))
+	{
+	  avr_asm_len ("swap %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // LSL + ANDI will do.
+	       && obit == ibit + 1)
+	{
+	  avr_asm_len ("lsl %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // LSR + ANDI will do.
+	       && obit == ibit - 1)
+	{
+	  avr_asm_len ("lsr %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+    }
+
+  if (REGNO (op[0]) != REGNO (op[2])
+      && obit == ibit)
+    {
+      if (2 + n_clr < len0
+	  // Same bit but different byte: MOV + ANDI will do.
+	  && test_hard_reg_class (LD_REGS, op[0]))
+	{
+	  avr_asm_len ("mov %0,%2", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // Same bit but different byte:  We can use ANDI + MOV,
+	       // but only if the input byte is LD_REGS and unused after.
+	       && test_hard_reg_class (LD_REGS, op[2])
+	       && reg_unused_after (insn, op[2]))
+	{
+	  avr_asm_len ("andi %2,1<<%3"  CR_TAB
+		       "mov %0,%2", op, plen, 2);
+	  clr_p = true;
+	}
+    }
+
+  // Output remaining instructions of the shorter sequence.
+
+  if (andi_p)
+    avr_asm_len ("andi %0,1<<%1", op, plen, 1);
+
+  if (clr_p)
+    {
+      for (int b = 0; b < n_bytes; ++b)
+	{
+	  rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b);
+	  if (REGNO (byte) != REGNO (op[0]))
+	    avr_asm_len ("clr %0", &byte, plen, 1);
+	}
+
+      // CLR_P means we found a shorter sequence, so we are done now.
+      return "";
+    }
+
+  // No shorter sequence found, just emit  BST, CLR*, BLD  sequence.
+
+  avr_asm_len ("bst %2,%3", op, plen, -1);
+
+  if (n_bytes == 4 && AVR_HAVE_MOVW)
+    avr_asm_len ("clr %A0"   CR_TAB
+		 "clr %B0"   CR_TAB
+		 "movw %C0,%A0", xop, plen, 3);
+  else
+    for (int b = 0; b < n_bytes; ++b)
+      {
+	rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b);
+	avr_asm_len ("clr %0", &byte, plen, 1);
+      }
+
+  return avr_asm_len ("bld %0,%1", op, plen, 1);
+}
+
+
 /* Output instructions to extract a bit to 8-bit register XOP[0].
    The input XOP[1] is a register or an 8-bit MEM in the lower I/O range.
    XOP[2] is the const_int bit position.  Return "".
@@ -9994,6 +10147,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len)
     case ADJUST_LEN_OUT_BITOP: avr_out_bitop (insn, op, &len); break;
     case ADJUST_LEN_EXTR_NOT: avr_out_extr_not (insn, op, &len); break;
     case ADJUST_LEN_EXTR: avr_out_extr (insn, op, &len); break;
+    case ADJUST_LEN_INSV: avr_out_insv (insn, op, &len); break;
 
     case ADJUST_LEN_PLUS: avr_out_plus (insn, op, &len); break;
     case ADJUST_LEN_ADDTO_SP: avr_out_addto_sp (op, &len); break;
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 83dd15040b0..73bddec2b33 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -163,7 +163,7 @@ (define_attr "adjust_len"
    ashlhi, ashrhi, lshrhi,
    ashlsi, ashrsi, lshrsi,
    ashlpsi, ashrpsi, lshrpsi,
-   insert_bits, insv_notbit,
+   insert_bits, insv_notbit, insv,
    add_set_ZN, cmp_uext, cmp_sext,
    no"
   (const_string "no"))
@@ -9818,6 +9818,12 @@ (define_insn_and_split "*extzv.io.lsr7"
                          (const_int 1)
                          (const_int 7)))])
 
+;; This insn serves as a combine bridge because insn combine will only
+;; combine so much (3) insns at most.  It's not actually an open coded
+;; bit-insertion but just a part of it.  It may occur in other contexts
+;; than INSV though, and in such a case the code may be worse than without
+;; this pattern.  We still have to emit code for it in that case because
+;; we cannot roll back.
 (define_insn_and_split "*insv.any_shift.<mode>_split"
   [(set (match_operand:QISI 0 "register_operand" "=r")
         (and:QISI (any_shift:QISI (match_operand:QISI 1 "register_operand" "r")
@@ -9840,27 +9846,9 @@ (define_insn "*insv.any_shift.<mode>"
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
-    int shift = <CODE> == ASHIFT ? INTVAL (operands[2]) : -INTVAL (operands[2]);
-    int mask = GET_MODE_MASK (<MODE>mode) & INTVAL (operands[3]);
-    // Position of the output / input bit, respectively.
-    int obit = exact_log2 (mask);
-    int ibit = obit - shift;
-    gcc_assert (IN_RANGE (obit, 0, <MSB>));
-    gcc_assert (IN_RANGE (ibit, 0, <MSB>));
-    operands[3] = GEN_INT (obit);
-    operands[2] = GEN_INT (ibit);
-
-    if (<SIZE> == 1) return "bst %T1%T2\;clr %0\;"                 "bld %T0%T3";
-    if (<SIZE> == 2) return "bst %T1%T2\;clr %A0\;clr %B0\;"       "bld %T0%T3";
-    if (<SIZE> == 3) return "bst %T1%T2\;clr %A0\;clr %B0\;clr %C0\;bld %T0%T3";
-    return AVR_HAVE_MOVW
-      ? "bst %T1%T2\;clr %A0\;clr %B0\;movw %C0,%A0\;"  "bld %T0%T3"
-      : "bst %T1%T2\;clr %A0\;clr %B0\;clr %C0\;clr %D0\;bld %T0%T3";
+    return avr_out_insv (insn, operands, nullptr);
   }
-  [(set (attr "length")
-        (minus (symbol_ref "2 + <SIZE>")
-               ; One less if we can use a MOVW to clear.
-               (symbol_ref "<SIZE> == 4 && AVR_HAVE_MOVW")))])
+  [(set_attr "adjust_len" "insv")])
 
 
 (define_insn_and_split "*extzv.<mode>hi2"
diff --git a/gcc/testsuite/gcc.target/avr/torture/insv-anyshift.c b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift.c
new file mode 100644
index 00000000000..2f94e1787b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+
+typedef __UINT32_TYPE__ uint32_t;
+
+/* Testing inlined and completely folded versions of functions
+   against their non-inlined, non-folded counterparts.  */
+
+#define MK_FUN1(OBIT, LSR)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint32_t fun1_lsr_##OBIT##_##LSR##_ai (int x, uint32_t a)		\
+  {									\
+    (void) x;								\
+    return (a >> LSR) & (1ul << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint32_t fun1_lsr_##OBIT##_##LSR##_ni (int x, uint32_t a)		\
+  {									\
+    return fun1_lsr_##OBIT##_##LSR##_ai (x, a);				\
+  }									\
+									\
+  void test_fun1_lsr_##OBIT##_##LSR (void)				\
+  {									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1ul << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, 1ul << (OBIT + LSR)))	\
+      __builtin_abort();						\
+									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1ul << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, -1ul))			\
+      __builtin_abort();						\
+  }
+  
+
+#define MK_FUN2(OBIT, LSL)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint32_t fun2_lsl_##OBIT##_##LSL##_ai (int x, uint32_t a)		\
+  {									\
+    (void) x;								\
+    return (a << LSL) & (1ul << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint32_t fun2_lsl_##OBIT##_##LSL##_ni (int x, uint32_t a)		\
+  {									\
+    return fun2_lsl_##OBIT##_##LSL##_ai (x, a);				\
+  }									\
+									\
+  void test_fun2_lsl_##OBIT##_##LSL (void)				\
+  {									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (0, 1ul << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (0, 1ul << (OBIT - LSL)))	\
+      __builtin_abort();						\
+									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (0, 1ul << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (0, -1ul))			\
+      __builtin_abort();						\
+  }
+
+
+MK_FUN1 (13, 15)
+MK_FUN1 (13, 16)
+MK_FUN1 (13, 17)
+MK_FUN1 (13, 12)
+
+MK_FUN2 (12, 8)
+MK_FUN2 (13, 8)
+MK_FUN2 (16, 8)
+
+int main (void)
+{
+  test_fun1_lsr_13_15 ();
+  test_fun1_lsr_13_16 ();
+  test_fun1_lsr_13_17 ();
+  test_fun1_lsr_13_12 ();
+
+  test_fun2_lsl_12_8 ();
+  test_fun2_lsl_13_8 ();
+  test_fun2_lsl_16_8 ();
+
+  return 0;
+}


* [patch,avr,applied] Improve output of insn "*insv.any_shift.<mode>".
  2023-11-02 11:50 [AVR PATCH] Optimize (X>>C)&1 for C in [1,4,8,16,24] in *insv.any_shift.<mode> Roger Sayle
  2023-11-09 18:08 ` Georg-Johann Lay
@ 2024-03-05 11:15 ` Georg-Johann Lay
  1 sibling, 0 replies; 3+ messages in thread
From: Georg-Johann Lay @ 2024-03-05 11:15 UTC (permalink / raw)
  To: Roger Sayle, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1699 bytes --]

Applied Roger's proposed improvements with some changes:

Lengthy code is more convenient in avr.cc than in an insn's output
statement in avr.md, and it makes it easy to work out the exact
instruction length.  Moreover, the code can handle shifts
with offset zero (the cases of the *and<mode>3 insns).
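
A shift offset of zero is simply a plain AND with a single-bit mask, so
source like the following sketch (hypothetical function name) can now
also be handled by avr_out_insv through the new *andhi3 alternative:

    unsigned short keep_bit8 (unsigned short x)
    {
      return x & 0x0100;  /* mask is an exact power of 2, shift offset 0 */
    }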

Passed with no new regressions on ATmega128.

Applied as https://gcc.gnu.org/r14-9317

Johann

--

AVR: Improve output of insn "*insv.any_shift.<mode>_split".

The instructions printed by insn "*insv.any_shift.<mode>_split" were
sub-optimal.  The code to print the improved output is lengthy and
performed by new function avr_out_insv.  As it turns out, the function
can also handle shift offsets of zero, which covers "*andhi3", "*andpsi3"
and "*andsi3".  Thus, these three insns get a new 3-operand alternative
where the 3rd operand is an exact power of 2.

gcc/
         * config/avr/avr-protos.h (avr_out_insv): New proto.
         * config/avr/avr.cc (avr_out_insv): New function.
         (avr_adjust_insn_length) [ADJUST_LEN_INSV]: Handle case.
         (avr_cbranch_cost) [ZERO_EXTRACT]: Adjust rtx costs.
         * config/avr/avr.md (define_attr "adjust_len") Add insv.
         (andhi3, *andhi3, andpsi3, *andpsi3, andsi3, *andsi3):
         Add constraint alternative where the 3rd operand is a power
         of 2, and the source register may differ from the destination.
         (*insv.any_shift.<mode>_split): Call avr_out_insv to output
         instructions.  Set attr "length" to "insv".
         * config/avr/constraints.md (Cb2, Cb3, Cb4): New constraints.

gcc/testsuite/
         * gcc.target/avr/torture/insv-anyshift-hi.c: New test.
         * gcc.target/avr/torture/insv-anyshift-si.c: New test.

[-- Attachment #2: rs.diff --]
[-- Type: text/x-patch, Size: 25036 bytes --]

commit 49a1a340ea0eef681f23b6861f3cdb6840aadd99
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Tue Mar 5 11:06:17 2024 +0100

    AVR: Improve output of insn "*insv.any_shift.<mode>_split".
    
    The instructions printed by insn "*insv.any_shift.<mode>_split" were
    sub-optimal.  The code to print the improved output is lengthy and
    performed by new function avr_out_insv.  As it turns out, the function
    can also handle shift offsets of zero, which covers "*andhi3", "*andpsi3"
    and "*andsi3".  Thus, these three insns get a new 3-operand alternative
    where the 3rd operand is an exact power of 2.
    
    gcc/
            * config/avr/avr-protos.h (avr_out_insv): New proto.
            * config/avr/avr.cc (avr_out_insv): New function.
            (avr_adjust_insn_length) [ADJUST_LEN_INSV]: Handle case.
            (avr_cbranch_cost) [ZERO_EXTRACT]: Adjust rtx costs.
            * config/avr/avr.md (define_attr "adjust_len") Add insv.
            (andhi3, *andhi3, andpsi3, *andpsi3, andsi3, *andsi3):
            Add constraint alternative where the 3rd operand is a power
            of 2, and the source register may differ from the destination.
            (*insv.any_shift.<mode>_split): Call avr_out_insv to output
            instructions.  Set attr "length" to "insv".
            * config/avr/constraints.md (Cb2, Cb3, Cb4): New constraints.
    
    gcc/testsuite/
            * gcc.target/avr/torture/insv-anyshift-hi.c: New test.
            * gcc.target/avr/torture/insv-anyshift-si.c: New test.

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 3e19409d636..bb680312117 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -58,6 +58,7 @@ extern const char *ret_cond_branch (rtx x, int len, int reverse);
 extern const char *avr_out_movpsi (rtx_insn *, rtx*, int*);
 extern const char *avr_out_sign_extend (rtx_insn *, rtx*, int*);
 extern const char *avr_out_insert_notbit (rtx_insn *, rtx*, int*);
+extern const char *avr_out_insv (rtx_insn *, rtx*, int*);
 extern const char *avr_out_extr (rtx_insn *, rtx*, int*);
 extern const char *avr_out_extr_not (rtx_insn *, rtx*, int*);
 extern const char *avr_out_plus_set_ZN (rtx*, int*);
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index c8b2b504e3f..36995e05cbe 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -9795,6 +9795,178 @@ avr_out_insert_notbit (rtx_insn *insn, rtx op[], int *plen)
 }
 
 
+/* Output instructions for  XOP[0] = (XOP[1] <Shift> XOP[2]) & XOP[3]  where
+   -  XOP[0] and XOP[1] have the same mode which is one of: QI, HI, PSI, SI.
+   -  XOP[3] is an exact const_int power of 2.
+   -  XOP[2] and XOP[3] are const_int.
+   -  <Shift> is any of: ASHIFT, LSHIFTRT, ASHIFTRT.
+   -  The result depends on XOP[1].
+   or  XOP[0] = XOP[1] & XOP[2]  where
+   -  XOP[0] and XOP[1] have the same mode which is one of: HI, PSI, SI.
+   -  XOP[2] is an exact const_int power of 2.
+   Returns "".
+   PLEN != 0: Set *PLEN to the code length in words.  Don't output anything.
+   PLEN == 0: Output instructions.  */
+
+const char*
+avr_out_insv (rtx_insn *insn, rtx xop[], int *plen)
+{
+  machine_mode mode = GET_MODE (xop[0]);
+  int n_bytes = GET_MODE_SIZE (mode);
+  rtx xsrc = SET_SRC (single_set (insn));
+
+  gcc_assert (AND == GET_CODE (xsrc));
+
+  rtx xop2 = xop[2];
+  rtx xop3 = xop[3];
+
+  if (REG_P (XEXP (xsrc, 0)))
+    {
+      // This function can also handle AND with an exact power of 2,
+      // which can be regarded as a XOP[1] shift with offset 0.
+      rtx xshift = gen_rtx_ASHIFT (mode, xop[1], const0_rtx);
+      xsrc = gen_rtx_AND (mode, xshift, xop[2]);
+      xop3 = xop[2];
+      xop2 = const0_rtx;
+    }
+
+  // Any of ASHIFT, LSHIFTRT, ASHIFTRT.
+  enum rtx_code code = GET_CODE (XEXP (xsrc, 0));
+  int shift = code == ASHIFT ? INTVAL (xop2) : -INTVAL (xop2);
+
+  // Determines the position of the output bit.
+  unsigned mask = GET_MODE_MASK (mode) & INTVAL (xop3);
+
+  // Position of the output / input bit, respectively.
+  int obit = exact_log2 (mask);
+  int ibit = obit - shift;
+
+  gcc_assert (IN_RANGE (obit, 0, GET_MODE_BITSIZE (mode) - 1));
+  gcc_assert (IN_RANGE (ibit, 0, GET_MODE_BITSIZE (mode) - 1));
+
+  // In the remainder, use the sub-bytes that hold the bits.
+  rtx op[4] =
+    {
+      // Output
+      simplify_gen_subreg (QImode, xop[0], mode, obit / 8),
+      GEN_INT (obit & 7),
+      // Input
+      simplify_gen_subreg (QImode, xop[1], mode, ibit / 8),
+      GEN_INT (ibit & 7)
+    };
+  obit &= 7;
+  ibit &= 7;
+
+  // The length of the default sequence at the end of this function.
+  // We only emit anything other than the default when we find a sequence
+  // that is strictly shorter than the default sequence; which is:
+  // BST + <CLR-result-bytes> + BLD.
+  const int len0 = 2 + n_bytes - (n_bytes == 4 && AVR_HAVE_MOVW);
+
+  // Finding something shorter than the default sequence implies that there
+  // must be at most 2 instructions that deal with the bytes containing the
+  // relevant bits.  In addition, we need  N_BYTES - 1  instructions to clear
+  // the remaining result bytes.
+
+  const int n_clr = n_bytes - 1;
+  bool clr_p = false;
+  bool andi_p = false;
+
+  if (plen)
+    *plen = 0;
+
+  if (REGNO (op[0]) == REGNO (op[2])
+      // Output reg allows ANDI.
+      && test_hard_reg_class (LD_REGS, op[0]))
+    {
+      if (1 + n_clr < len0
+	  // Same byte and bit: A single ANDI will do.
+	  && obit == ibit)
+	{
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // |obit - ibit| = 4:  SWAP + ANDI will do.
+	       && (obit == ibit + 4 || obit == ibit - 4))
+	{
+	  avr_asm_len ("swap %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // LSL + ANDI will do.
+	       && obit == ibit + 1)
+	{
+	  avr_asm_len ("lsl %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // LSR + ANDI will do.
+	       && obit == ibit - 1)
+	{
+	  avr_asm_len ("lsr %0", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+    }
+
+  if (REGNO (op[0]) != REGNO (op[2])
+      && obit == ibit)
+    {
+      if (2 + n_clr < len0
+	  // Same bit but different byte: MOV + ANDI will do.
+	  && test_hard_reg_class (LD_REGS, op[0]))
+	{
+	  avr_asm_len ("mov %0,%2", op, plen, 1);
+	  clr_p = andi_p = true;
+	}
+      else if (2 + n_clr < len0
+	       // Same bit but different byte:  We can use ANDI + MOV,
+	       // but only if the input byte is LD_REGS and unused after.
+	       && test_hard_reg_class (LD_REGS, op[2])
+	       && reg_unused_after (insn, op[2]))
+	{
+	  avr_asm_len ("andi %2,1<<%3"  CR_TAB
+		       "mov %0,%2", op, plen, 2);
+	  clr_p = true;
+	}
+    }
+
+  // Output remaining instructions of the shorter sequence.
+
+  if (andi_p)
+    avr_asm_len ("andi %0,1<<%1", op, plen, 1);
+
+  if (clr_p)
+    {
+      for (int b = 0; b < n_bytes; ++b)
+	{
+	  rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b);
+	  if (REGNO (byte) != REGNO (op[0]))
+	    avr_asm_len ("clr %0", &byte, plen, 1);
+	}
+
+      // CLR_P means we found a shorter sequence, so we are done now.
+      return "";
+    }
+
+  // No shorter sequence found, just emit  BST, CLR*, BLD  sequence.
+
+  avr_asm_len ("bst %2,%3", op, plen, -1);
+
+  if (n_bytes == 4 && AVR_HAVE_MOVW)
+    avr_asm_len ("clr %A0"   CR_TAB
+		 "clr %B0"   CR_TAB
+		 "movw %C0,%A0", xop, plen, 3);
+  else
+    for (int b = 0; b < n_bytes; ++b)
+      {
+	rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b);
+	avr_asm_len ("clr %0", &byte, plen, 1);
+      }
+
+  return avr_asm_len ("bld %0,%1", op, plen, 1);
+}
+
+
 /* Output instructions to extract a bit to 8-bit register XOP[0].
    The input XOP[1] is a register or an 8-bit MEM in the lower I/O range.
    XOP[2] is the const_int bit position.  Return "".
@@ -10721,6 +10893,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len)
     case ADJUST_LEN_OUT_BITOP: avr_out_bitop (insn, op, &len); break;
     case ADJUST_LEN_EXTR_NOT: avr_out_extr_not (insn, op, &len); break;
     case ADJUST_LEN_EXTR: avr_out_extr (insn, op, &len); break;
+    case ADJUST_LEN_INSV: avr_out_insv (insn, op, &len); break;
 
     case ADJUST_LEN_PLUS: avr_out_plus (insn, op, &len); break;
     case ADJUST_LEN_ADDTO_SP: avr_out_addto_sp (op, &len); break;
@@ -12206,6 +12379,14 @@ avr_cbranch_cost (rtx x)
       return COSTS_N_INSNS (size + 1 + 1);
     }
 
+  if (GET_CODE (xreg) == ZERO_EXTRACT
+      && XEXP (xreg, 1) == const1_rtx)
+    {
+      // Branch on a single bit, with an additional edge due to less
+      // register pressure.
+      return (int) COSTS_N_INSNS (1.5);
+    }
+
   bool reg_p = register_operand (xreg, mode);
   bool reg_or_0_p = reg_or_0_operand (xval, mode);
 
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 6606837b5f7..6bdf4682fab 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -170,7 +170,7 @@ (define_attr "adjust_len"
    ashlhi, ashrhi, lshrhi,
    ashlsi, ashrsi, lshrsi,
    ashlpsi, ashrpsi, lshrpsi,
-   insert_bits, insv_notbit,
+   insert_bits, insv_notbit, insv,
    add_set_ZN, cmp_uext, cmp_sext,
    no"
   (const_string "no"))
@@ -4380,10 +4380,10 @@ (define_insn "*andqi3"
   [(set_attr "length" "1,1,2")])
 
 (define_insn_and_split "andhi3"
-  [(set (match_operand:HI 0 "register_operand"       "=??r,d,d,r  ,r")
-        (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0  ,0")
-                (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,n")))
-   (clobber (match_scratch:QI 3                        "=X,X,X,X  ,&d"))]
+  [(set (match_operand:HI 0 "register_operand"       "=??r,d,d,r  ,r  ,r")
+        (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0  ,r  ,0")
+                (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,Cb2,n")))
+   (clobber (match_scratch:QI 3                        "=X,X,X,X  ,X  ,&d"))]
   ""
   "#"
   "&& reload_completed"
@@ -4394,10 +4394,10 @@ (define_insn_and_split "andhi3"
               (clobber (reg:CC REG_CC))])])
 
 (define_insn "*andhi3"
-  [(set (match_operand:HI 0 "register_operand"       "=??r,d,d,r  ,r")
-        (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0  ,0")
-                (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,n")))
-   (clobber (match_scratch:QI 3                        "=X,X,X,X  ,&d"))
+  [(set (match_operand:HI 0 "register_operand"       "=??r,d,d,r  ,r  ,r")
+        (and:HI (match_operand:HI 1 "register_operand" "%0,0,0,0  ,r  ,0")
+                (match_operand:HI 2 "nonmemory_operand" "r,s,n,Ca2,Cb2,n")))
+   (clobber (match_scratch:QI 3                        "=X,X,X,X  ,X  ,&d"))
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
@@ -4405,17 +4405,19 @@ (define_insn "*andhi3"
       return "and %A0,%A2\;and %B0,%B2";
     else if (which_alternative == 1)
       return "andi %A0,lo8(%2)\;andi %B0,hi8(%2)";
+    else if (which_alternative == 4)
+      return avr_out_insv (insn, operands, NULL);
 
     return avr_out_bitop (insn, operands, NULL);
   }
-  [(set_attr "length" "2,2,2,4,4")
-   (set_attr "adjust_len" "*,*,out_bitop,out_bitop,out_bitop")])
+  [(set_attr "length" "2,2,2,4,4,4")
+   (set_attr "adjust_len" "*,*,out_bitop,out_bitop,insv,out_bitop")])
 
 (define_insn_and_split "andpsi3"
-  [(set (match_operand:PSI 0 "register_operand"        "=??r,d,r  ,r")
-        (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0  ,0")
-                 (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,n")))
-   (clobber (match_scratch:QI 3                          "=X,X,X  ,&d"))]
+  [(set (match_operand:PSI 0 "register_operand"        "=??r,d,r  ,r  ,r")
+        (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0  ,r  ,0")
+                 (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,Cb3,n")))
+   (clobber (match_scratch:QI 3                          "=X,X,X  ,X  ,&d"))]
   ""
   "#"
   "&& reload_completed"
@@ -4426,10 +4428,10 @@ (define_insn_and_split "andpsi3"
               (clobber (reg:CC REG_CC))])])
 
 (define_insn "*andpsi3"
-  [(set (match_operand:PSI 0 "register_operand"        "=??r,d,r  ,r")
-        (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0  ,0")
-                 (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,n")))
-   (clobber (match_scratch:QI 3                          "=X,X,X  ,&d"))
+  [(set (match_operand:PSI 0 "register_operand"        "=??r,d,r  ,r  ,r")
+        (and:PSI (match_operand:PSI 1 "register_operand" "%0,0,0  ,r  ,0")
+                 (match_operand:PSI 2 "nonmemory_operand" "r,n,Ca3,Cb3,n")))
+   (clobber (match_scratch:QI 3                          "=X,X,X  ,X  ,&d"))
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
@@ -4438,16 +4440,19 @@ (define_insn "*andpsi3"
              "and %B0,%B2" CR_TAB
              "and %C0,%C2";
 
+    if (which_alternative == 3)
+      return avr_out_insv (insn, operands, NULL);
+
     return avr_out_bitop (insn, operands, NULL);
   }
-  [(set_attr "length" "3,3,6,6")
-   (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")])
+  [(set_attr "length" "3,3,6,5,6")
+   (set_attr "adjust_len" "*,out_bitop,out_bitop,insv,out_bitop")])
 
 (define_insn_and_split "andsi3"
-  [(set (match_operand:SI 0 "register_operand"       "=??r,d,r  ,r")
-        (and:SI (match_operand:SI 1 "register_operand" "%0,0,0  ,0")
-                (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,n")))
-   (clobber (match_scratch:QI 3                        "=X,X,X  ,&d"))]
+  [(set (match_operand:SI 0 "register_operand"       "=??r,d,r  ,r  ,r")
+        (and:SI (match_operand:SI 1 "register_operand" "%0,0,0  ,r  ,0")
+                (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,Cb4,n")))
+   (clobber (match_scratch:QI 3                        "=X,X,X  ,X  ,&d"))]
   ""
   "#"
   "&& reload_completed"
@@ -4458,10 +4463,10 @@ (define_insn_and_split "andsi3"
               (clobber (reg:CC REG_CC))])])
 
 (define_insn "*andsi3"
-  [(set (match_operand:SI 0 "register_operand"       "=??r,d,r  ,r")
-        (and:SI (match_operand:SI 1 "register_operand" "%0,0,0  ,0")
-                (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,n")))
-   (clobber (match_scratch:QI 3                        "=X,X,X  ,&d"))
+  [(set (match_operand:SI 0 "register_operand"       "=??r,d,r  ,r  ,r")
+        (and:SI (match_operand:SI 1 "register_operand" "%0,0,0  ,r  ,0")
+                (match_operand:SI 2 "nonmemory_operand" "r,n,Ca4,Cb4,n")))
+   (clobber (match_scratch:QI 3                        "=X,X,X  ,X  ,&d"))
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
@@ -4471,10 +4476,13 @@ (define_insn "*andsi3"
              "and %C0,%C2" CR_TAB
              "and %D0,%D2";
 
+    if (which_alternative == 3)
+      return avr_out_insv (insn, operands, NULL);
+
     return avr_out_bitop (insn, operands, NULL);
   }
-  [(set_attr "length" "4,4,8,8")
-   (set_attr "adjust_len" "*,out_bitop,out_bitop,out_bitop")])
+  [(set_attr "length" "4,4,8,6,8")
+   (set_attr "adjust_len" "*,out_bitop,out_bitop,insv,out_bitop")])
 
 (define_peephole2 ; andi
   [(parallel [(set (match_operand:QI 0 "d_register_operand" "")
@@ -9852,6 +9860,12 @@ (define_insn_and_split "*extzv.io.lsr7"
                          (const_int 1)
                          (const_int 7)))])
 
+;; This insn serves as a combine bridge because insn combine will only
+;; combine so much (3) insns at most.  It's not actually an open coded
+;; bit-insertion but just a part of it.  It may occur in other contexts
+;; than INSV though, and in such a case the code may be worse than without
+;; this pattern.  We still have to emit code for it in that case because
+;; we cannot roll back.
 (define_insn_and_split "*insv.any_shift.<mode>_split"
   [(set (match_operand:QISI 0 "register_operand" "=r")
         (and:QISI (any_shift:QISI (match_operand:QISI 1 "register_operand" "r")
@@ -9874,27 +9888,9 @@ (define_insn "*insv.any_shift.<mode>"
    (clobber (reg:CC REG_CC))]
   "reload_completed"
   {
-    int shift = <CODE> == ASHIFT ? INTVAL (operands[2]) : -INTVAL (operands[2]);
-    int mask = GET_MODE_MASK (<MODE>mode) & INTVAL (operands[3]);
-    // Position of the output / input bit, respectively.
-    int obit = exact_log2 (mask);
-    int ibit = obit - shift;
-    gcc_assert (IN_RANGE (obit, 0, <MSB>));
-    gcc_assert (IN_RANGE (ibit, 0, <MSB>));
-    operands[3] = GEN_INT (obit);
-    operands[2] = GEN_INT (ibit);
-
-    if (<SIZE> == 1) return "bst %T1%T2\;clr %0\;"                 "bld %T0%T3";
-    if (<SIZE> == 2) return "bst %T1%T2\;clr %A0\;clr %B0\;"       "bld %T0%T3";
-    if (<SIZE> == 3) return "bst %T1%T2\;clr %A0\;clr %B0\;clr %C0\;bld %T0%T3";
-    return AVR_HAVE_MOVW
-      ? "bst %T1%T2\;clr %A0\;clr %B0\;movw %C0,%A0\;"  "bld %T0%T3"
-      : "bst %T1%T2\;clr %A0\;clr %B0\;clr %C0\;clr %D0\;bld %T0%T3";
+    return avr_out_insv (insn, operands, nullptr);
   }
-  [(set (attr "length")
-        (minus (symbol_ref "2 + <SIZE>")
-               ; One less if we can use a MOVW to clear.
-               (symbol_ref "<SIZE> == 4 && AVR_HAVE_MOVW")))])
+  [(set_attr "adjust_len" "insv")])
 
 
 (define_insn_and_split "*extzv.<mode>hi2"
diff --git a/gcc/config/avr/constraints.md b/gcc/config/avr/constraints.md
index 81ed63db2cc..fac54da17db 100644
--- a/gcc/config/avr/constraints.md
+++ b/gcc/config/avr/constraints.md
@@ -188,6 +188,21 @@ (define_constraint "Co4"
   (and (match_code "const_int")
        (match_test "avr_popcount_each_byte (op, 4, (1<<0) | (1<<1) | (1<<8))")))
 
+(define_constraint "Cb2"
+  "Constant 2-byte integer that has exactly 1 bit set."
+  (and (match_code "const_int")
+       (match_test "single_one_operand (op, HImode)")))
+
+(define_constraint "Cb3"
+  "Constant 3-byte integer that has exactly 1 bit set."
+  (and (match_code "const_int")
+       (match_test "single_one_operand (op, PSImode)")))
+
+(define_constraint "Cb4"
+  "Constant 4-byte integer that has exactly 1 bit set."
+  (and (match_code "const_int")
+       (match_test "single_one_operand (op, SImode)")))
+
 (define_constraint "Cx2"
   "Constant 2-byte integer that allows XOR without clobber register."
   (and (match_code "const_int")
diff --git a/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-hi.c b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-hi.c
new file mode 100644
index 00000000000..7ee5c04813a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-hi.c
@@ -0,0 +1,141 @@
+/* { dg-do run } */
+/* { dg-additional-options { -fno-split-wide-types } } */
+
+typedef __UINT16_TYPE__ uint16_t;
+
+/* Testing inlined and completely folded versions of functions
+   against their non-inlined, non-folded counterparts.  */
+
+#define MK_FUN1(OBIT, LSR)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint16_t fun1_lsr_##OBIT##_##LSR##_ai (int x, uint16_t a)		\
+  {									\
+    (void) x;								\
+    return (a >> LSR) & (1u << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint16_t fun1_lsr_##OBIT##_##LSR##_ni (int x, uint16_t a)		\
+  {									\
+    return fun1_lsr_##OBIT##_##LSR##_ai (x, a);				\
+  }									\
+									\
+  void test_fun1_lsr_##OBIT##_##LSR (void)				\
+  {									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1u << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, 1u << (OBIT + LSR)))	\
+      __builtin_abort();						\
+									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1u << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, -1u))			\
+      __builtin_abort();						\
+  }
+
+#define MK_FUN3(OBIT, LSR)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint16_t fun3_lsr_##OBIT##_##LSR##_ai (uint16_t a)			\
+  {									\
+    return (a >> LSR) & (1u << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint16_t fun3_lsr_##OBIT##_##LSR##_ni (uint16_t a)			\
+  {									\
+    return fun3_lsr_##OBIT##_##LSR##_ai (a);				\
+  }									\
+									\
+  void test_fun3_lsr_##OBIT##_##LSR (void)				\
+  {									\
+    if (fun3_lsr_##OBIT##_##LSR##_ni (1u << (OBIT + LSR))		\
+	!= fun3_lsr_##OBIT##_##LSR##_ai (1u << (OBIT + LSR)))		\
+      __builtin_abort();						\
+									\
+    if (fun3_lsr_##OBIT##_##LSR##_ni (1u << (OBIT + LSR))		\
+	!= fun3_lsr_##OBIT##_##LSR##_ai (-1u))				\
+      __builtin_abort();						\
+  }
+
+
+#define MK_FUN2(OBIT, LSL)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint16_t fun2_lsl_##OBIT##_##LSL##_ai (uint16_t a)			\
+  {									\
+    return (a << LSL) & (1u << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint16_t fun2_lsl_##OBIT##_##LSL##_ni (uint16_t a)			\
+  {									\
+    return fun2_lsl_##OBIT##_##LSL##_ai (a);				\
+  }									\
+									\
+  void test_fun2_lsl_##OBIT##_##LSL (void)				\
+  {									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (1u << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (1u << (OBIT - LSL)))		\
+      __builtin_abort();						\
+									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (1u << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (-1u))				\
+      __builtin_abort();						\
+  }
+
+
+MK_FUN1 (10, 4)
+MK_FUN1 (6, 1)
+MK_FUN1 (1, 5)
+MK_FUN1 (0, 8)
+MK_FUN1 (0, 4)
+MK_FUN1 (0, 1)
+MK_FUN1 (0, 0)
+
+MK_FUN3 (10, 4)
+MK_FUN3 (6, 1)
+MK_FUN3 (1, 5)
+MK_FUN3 (0, 8)
+MK_FUN3 (0, 4)
+MK_FUN3 (0, 1)
+MK_FUN3 (0, 0)
+
+MK_FUN2 (12, 8)
+MK_FUN2 (15, 15)
+MK_FUN2 (14, 12)
+MK_FUN2 (8, 8)
+MK_FUN2 (7, 4)
+MK_FUN2 (5, 4)
+MK_FUN2 (5, 1)
+MK_FUN2 (4, 0)
+MK_FUN2 (1, 0)
+MK_FUN2 (0, 0)
+
+int main (void)
+{
+  test_fun1_lsr_10_4 ();
+  test_fun1_lsr_6_1 ();
+  test_fun1_lsr_1_5 ();
+  test_fun1_lsr_0_8 ();
+  test_fun1_lsr_0_4 ();
+  test_fun1_lsr_0_1 ();
+  test_fun1_lsr_0_0 ();
+
+  test_fun3_lsr_10_4 ();
+  test_fun3_lsr_6_1 ();
+  test_fun3_lsr_1_5 ();
+  test_fun3_lsr_0_8 ();
+  test_fun3_lsr_0_4 ();
+  test_fun3_lsr_0_1 ();
+  test_fun3_lsr_0_0 ();
+
+  test_fun2_lsl_12_8 ();
+  test_fun2_lsl_15_15 ();
+  test_fun2_lsl_14_12 ();
+  test_fun2_lsl_8_8 ();
+  test_fun2_lsl_7_4 ();
+  test_fun2_lsl_5_4 ();
+  test_fun2_lsl_5_1 ();
+  test_fun2_lsl_4_0 ();
+  test_fun2_lsl_1_0 ();
+  test_fun2_lsl_0_0 ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-si.c b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-si.c
new file mode 100644
index 00000000000..f52593cf0a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/insv-anyshift-si.c
@@ -0,0 +1,89 @@
+/* { dg-do run } */
+
+typedef __UINT32_TYPE__ uint32_t;
+
+/* Testing inlined and completely folded versions of functions
+   against their non-inlined, non-folded counterparts.  */
+
+#define MK_FUN1(OBIT, LSR)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint32_t fun1_lsr_##OBIT##_##LSR##_ai (int x, uint32_t a)		\
+  {									\
+    (void) x;								\
+    return (a >> LSR) & (1ul << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint32_t fun1_lsr_##OBIT##_##LSR##_ni (int x, uint32_t a)		\
+  {									\
+    return fun1_lsr_##OBIT##_##LSR##_ai (x, a);				\
+  }									\
+									\
+  void test_fun1_lsr_##OBIT##_##LSR (void)				\
+  {									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1ul << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, 1ul << (OBIT + LSR)))	\
+      __builtin_abort();						\
+									\
+    if (fun1_lsr_##OBIT##_##LSR##_ni (0, 1ul << (OBIT + LSR))		\
+	!= fun1_lsr_##OBIT##_##LSR##_ai (0, -1ul))			\
+      __builtin_abort();						\
+  }
+  
+
+#define MK_FUN2(OBIT, LSL)						\
+  static __inline__ __attribute__((__always_inline__))			\
+  uint32_t fun2_lsl_##OBIT##_##LSL##_ai (int x, uint32_t a)		\
+  {									\
+    (void) x;								\
+    return (a << LSL) & (1ul << OBIT);					\
+  }									\
+									\
+  __attribute__((__noinline__,__noclone__))				\
+  uint32_t fun2_lsl_##OBIT##_##LSL##_ni (int x, uint32_t a)		\
+  {									\
+    return fun2_lsl_##OBIT##_##LSL##_ai (x, a);				\
+  }									\
+									\
+  void test_fun2_lsl_##OBIT##_##LSL (void)				\
+  {									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (0, 1ul << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (0, 1ul << (OBIT - LSL)))	\
+      __builtin_abort();						\
+									\
+    if (fun2_lsl_##OBIT##_##LSL##_ni (0, 1ul << (OBIT - LSL))		\
+	!= fun2_lsl_##OBIT##_##LSL##_ai (0, -1ul))			\
+      __builtin_abort();						\
+  }
+
+
+MK_FUN1 (13, 15)
+MK_FUN1 (13, 16)
+MK_FUN1 (13, 17)
+MK_FUN1 (13, 12)
+MK_FUN1 (0, 31)
+MK_FUN1 (0, 8)
+MK_FUN1 (0, 0)
+
+MK_FUN2 (12, 8)
+MK_FUN2 (13, 8)
+MK_FUN2 (16, 8)
+MK_FUN2 (16, 0)
+
+int main (void)
+{
+  test_fun1_lsr_13_15 ();
+  test_fun1_lsr_13_16 ();
+  test_fun1_lsr_13_17 ();
+  test_fun1_lsr_13_12 ();
+  test_fun1_lsr_0_31 ();
+  test_fun1_lsr_0_8 ();
+  test_fun1_lsr_0_0 ();
+
+  test_fun2_lsl_12_8 ();
+  test_fun2_lsl_13_8 ();
+  test_fun2_lsl_16_8 ();
+  test_fun2_lsl_16_0 ();
+
+  return 0;
+}

