public inbox for gcc-patches@gcc.gnu.org
* [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions
@ 2020-08-12  8:40 James Greenhalgh
  2020-08-12  8:54 ` Christophe Lyon
  2020-08-12  9:03 ` Richard Sandiford
  0 siblings, 2 replies; 3+ messages in thread
From: James Greenhalgh @ 2020-08-12  8:40 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

Hi,

As subject, this patch rewrites the mla intrinsics to use a + b * c rather
than inline assembler, thereby opening them to CSE, scheduling, etc.
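
For example (a hypothetical caller, not part of the patch), the plain C
implementation lets the compiler see through the intrinsic and fold away a
multiply by zero, which the asm block previously forced it to emit:

  #include <arm_neon.h>

  /* Illustration only: with the C implementation the whole body folds to
     "return a"; the asm version always emitted an MLA instruction.  */
  int32x2_t
  fold_example (int32x2_t a, int32x2_t c)
  {
    return vmla_s32 (a, vdup_n_s32 (0), c);
  }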

Bootstrapped and tested on aarch64-none-linux-gnu.

OK?

Thanks,
James

---

gcc/ChangeLog:

2020-08-11  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/arm_neon.h (vmla_s8): Upgrade to C rather than asm.
	(vmla_s16): Likewise.
	(vmla_s32): Likewise.
	(vmla_u8): Likewise.
	(vmla_u16): Likewise.
	(vmla_u32): Likewise.
	(vmlaq_s8): Likewise.
	(vmlaq_s16): Likewise.
	(vmlaq_s32): Likewise.
	(vmlaq_u8): Likewise.
	(vmlaq_u16): Likewise.
	(vmlaq_u32): Likewise.


[-- Attachment #2: 0001-AArch64-Upgrade-integer-MLA-intrinsics-to-GCC-vector.patch --]
[-- Type: text/x-patch; name="0001-AArch64-Upgrade-integer-MLA-intrinsics-to-GCC-vector.patch", Size: 5033 bytes --]

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..aa548e4e6c7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7400,72 +7400,42 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 #define vmlal_high_lane_s16(a, b, c, d)                                 \
@@ -7941,72 +7911,42 @@ __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
 {
-  uint8x16_t __result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-  uint16x8_t __result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(__result)
-           : "0"(__a), "w"(__b), "w"(__c)
-           : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline float32x2_t


* Re: [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions
  2020-08-12  8:40 [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions James Greenhalgh
@ 2020-08-12  8:54 ` Christophe Lyon
  2020-08-12  9:03 ` Richard Sandiford
  1 sibling, 0 replies; 3+ messages in thread
From: Christophe Lyon @ 2020-08-12  8:54 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc Patches, nd

Hi James,

On Wed, 12 Aug 2020 at 10:40, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>
>
> Hi,
>
> As subject, this patch rewrites the mla intrinsics to use a + b * c rather
> than inline assembler, thereby opening them to CSE, scheduling, etc.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>

Do we have tests that make sure we still generate the mla instructions?
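
(E.g. something along these lines, if nothing like it exists already; just a
sketch, and the file name and scan pattern below are made up:)

  /* Hypothetical gcc.target/aarch64/vmla_1.c  */
  /* { dg-do compile } */
  /* { dg-options "-O2" } */

  #include <arm_neon.h>

  int32x2_t
  foo (int32x2_t a, int32x2_t b, int32x2_t c)
  {
    return vmla_s32 (a, b, c);
  }

  /* Check that the a + b * c form still combines back into MLA.  */
  /* { dg-final { scan-assembler "mla\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */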

> OK?
>
> Thanks,
> James
>
> ---
>
> gcc/ChangeLog:
>
> 2020-08-11  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * config/aarch64/arm_neon.h (vmla_s8): Upgrade to C rather than asm.
>         (vmla_s16): Likewise.
>         (vmla_s32): Likewise.
>         (vmla_u8): Likewise.
>         (vmla_u16): Likewise.
>         (vmla_u32): Likewise.
>         (vmlaq_s8): Likewise.
>         (vmlaq_s16): Likewise.
>         (vmlaq_s32): Likewise.
>         (vmlaq_u8): Likewise.
>         (vmlaq_u16): Likewise.
>         (vmlaq_u32): Likewise.
>


* Re: [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions
  2020-08-12  8:40 [AArch64] Upgrade integer MLA intrinsics to GCC vector extensions James Greenhalgh
  2020-08-12  8:54 ` Christophe Lyon
@ 2020-08-12  9:03 ` Richard Sandiford
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Sandiford @ 2020-08-12  9:03 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, nd

James Greenhalgh <james.greenhalgh@arm.com> writes:
> Hi,
>
> As subject, this patch rewrites the mla intrinsics to use a + b * c rather
> than inline assembler, thereby opening them to CSE, scheduling, etc.

Looks good for the unsigned ones.  For the signed ones, there's a risk
that the functions might become subject to the usual UB for signed
overflow, rather than acting just like the instructions do.  (Realise
that isn't unique to these functions, but it'd be good not to introduce
more instances of it.)

So for the signed ones, it might be safer to cast to the unsigned type,
do the operation, and then cast back.
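
E.g. something like this (an untested sketch of the shape it could take,
shown for vmla_s32):

  __extension__ extern __inline int32x2_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
  {
    /* Do the arithmetic on the unsigned type, where wrap-around is well
       defined, then reinterpret the result back to the signed type.  */
    return (int32x2_t) ((uint32x2_t) __a + (uint32x2_t) __b * (uint32x2_t) __c);
  }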

Thanks,
Richard

> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> OK?
>
> Thanks,
> James
>
> ---
>
> gcc/ChangeLog:
>
> 2020-08-11  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/aarch64/arm_neon.h (vmla_s8): Upgrade to C rather than asm.
> 	(vmla_s16): Likewise.
> 	(vmla_s32): Likewise.
> 	(vmla_u8): Likewise.
> 	(vmla_u16): Likewise.
> 	(vmla_u32): Likewise.
> 	(vmlaq_s8): Likewise.
> 	(vmlaq_s16): Likewise.
> 	(vmlaq_s32): Likewise.
> 	(vmlaq_u8): Likewise.
> 	(vmlaq_u16): Likewise.
> 	(vmlaq_u32): Likewise.
>