public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's
@ 2020-04-07 10:40 Andre Vieira (lists)
  2020-04-07 10:57 ` Christophe Lyon
  2020-04-07 14:06 ` Kyrylo Tkachov
  0 siblings, 2 replies; 4+ messages in thread
From: Andre Vieira (lists) @ 2020-04-07 10:40 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 811 bytes --]

Hi,

This patch fixes v[id]wdup intrinsics. They had two issues:
1) the predicated versions did not link the incoming inactive vector 
parameter to the output
2) The backend didn't enforce the wrap limit operand be in an odd register.

1) was fixed like we did for all other predicated intrinsics
2) requires a temporary hack where we pass the value in the top end of 
DImode operand. The proper fix would be to add a register CLASS but this 
interacted badly with other existing targets codegen.  We will look to 
fix this properly in GCC 11.

Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  <andre.simoesdiasvieira@arm.com>

         * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
         * config/arm/mve.md: Fix v[id]wdup patterns.


[-- Attachment #2: vidwdups.patch --]
[-- Type: text/plain, Size: 26277 bytes --]

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index e31c2e8fdc4f500bf9408d05ad86e151397627f7..47eead71d9515b4103a5b66999a3f9357dc3c3be 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -13585,29 +13585,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13615,8 +13619,9 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13624,8 +13629,9 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13633,29 +13639,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_vdwdupq_n_uv16qi (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_n_uv16qi (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_vdwdupq_n_uv4si (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_n_uv4si (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_vdwdupq_n_uv8hi (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_n_uv8hi (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint8x16_t __res = __builtin_mve_vdwdupq_n_uv16qi (*__a, __b, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv16qi (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __res = __builtin_mve_vdwdupq_n_uv16qi (*__a, __c, __imm);
+  *__a = __builtin_mve_vdwdupq_wb_uv16qi (*__a, __c, __imm);
   return __res;
 }
 
@@ -13663,8 +13673,9 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint32x4_t __res = __builtin_mve_vdwdupq_n_uv4si (*__a, __b, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv4si (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __res = __builtin_mve_vdwdupq_n_uv4si (*__a, __c, __imm);
+  *__a = __builtin_mve_vdwdupq_wb_uv4si (*__a, __c, __imm);
   return __res;
 }
 
@@ -13672,8 +13683,9 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint16x8_t __res = __builtin_mve_vdwdupq_n_uv8hi (*__a, __b, __imm);
-  *__a = __builtin_mve_vdwdupq_wb_uv8hi (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __res = __builtin_mve_vdwdupq_n_uv8hi (*__a, __c, __imm);
+  *__a = __builtin_mve_vdwdupq_wb_uv8hi (*__a, __c, __imm);
   return __res;
 }
 
@@ -13804,29 +13816,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv16qi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv4si (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv8hi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__inactive, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13834,8 +13850,9 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__inactive, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13843,8 +13860,9 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__inactive, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13852,29 +13870,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_viwdupq_n_uv16qi (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_n_uv16qi (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_viwdupq_n_uv4si (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_n_uv4si (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_viwdupq_n_uv8hi (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_n_uv8hi (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint8x16_t __res = __builtin_mve_viwdupq_n_uv16qi (*__a, __b, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv16qi (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __res = __builtin_mve_viwdupq_n_uv16qi (*__a, __c, __imm);
+  *__a = __builtin_mve_viwdupq_wb_uv16qi (*__a, __c, __imm);
   return __res;
 }
 
@@ -13882,8 +13904,9 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint32x4_t __res = __builtin_mve_viwdupq_n_uv4si (*__a, __b, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv4si (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __res = __builtin_mve_viwdupq_n_uv4si (*__a, __c, __imm);
+  *__a = __builtin_mve_viwdupq_wb_uv4si (*__a, __c, __imm);
   return __res;
 }
 
@@ -13891,11 +13914,13 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm)
 {
-  uint16x8_t __res = __builtin_mve_viwdupq_n_uv8hi (*__a, __b, __imm);
-  *__a = __builtin_mve_viwdupq_wb_uv8hi (*__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __res = __builtin_mve_viwdupq_n_uv8hi (*__a, __c, __imm);
+  *__a = __builtin_mve_viwdupq_wb_uv8hi (*__a, __c, __imm);
   return __res;
 }
 
+
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrdq_gather_base_wb_s64 (uint64x2_t * __addr, const int __offset)
@@ -14095,30 +14120,34 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv16qi (vuninitializedq_u8 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv8hi (vuninitializedq_u16 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv4si (vuninitializedq_u32 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint8x16_t __arg1 = vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__arg1, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
+  uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -14126,9 +14155,10 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint16x8_t __arg1 = vuninitializedq_u16 ();
-  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__arg1, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
+  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -14136,9 +14166,10 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint32x4_t __arg1 = vuninitializedq_u32 ();
-  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__arg1, *__a, __b, __imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
+  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -14197,30 +14228,34 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv16qi (vuninitializedq_u8 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv8hi (vuninitializedq_u16 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_viwdupq_m_n_uv4si (vuninitializedq_u32 (), __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_viwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint8x16_t __arg1 = vuninitializedq_u8 ();
-  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__arg1, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __arg1 = __arm_vuninitializedq_u8 ();
+  uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -14228,9 +14263,10 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint16x8_t __arg1 = vuninitializedq_u16 ();
-  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__arg1, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __arg1 = __arm_vuninitializedq_u16 ();
+  uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -14238,9 +14274,10 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_viwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p)
 {
-  uint32x4_t __arg1 = vuninitializedq_u32 ();
-  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__arg1, *__a, __b, __imm, __p);
-  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__arg1, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __arg1 = __arm_vuninitializedq_u32 ();
+  uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p);
+  *__a =  __builtin_mve_viwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p);
   return __res;
 }
 
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 10abc3fae3709891346b63213afb1fe3754af41a..4a506cc3861534b4ddc30ba8f4f3c4ec28a8cc69 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -9853,7 +9853,7 @@ (define_insn "mve_vddupq_m_wb_u<mode>_insn"
 (define_expand "mve_vdwdupq_n_u<mode>"
  [(match_operand:MVE_2 0 "s_register_operand")
   (match_operand:SI 1 "s_register_operand")
-  (match_operand:SI 2 "s_register_operand")
+  (match_operand:DI 2 "s_register_operand")
   (match_operand:SI 3 "mve_imm_selective_upto_8")]
  "TARGET_HAVE_MVE"
 {
@@ -9870,7 +9870,7 @@ (define_expand "mve_vdwdupq_n_u<mode>"
 (define_expand "mve_vdwdupq_wb_u<mode>"
  [(match_operand:SI 0 "s_register_operand")
   (match_operand:SI 1 "s_register_operand")
-  (match_operand:SI 2 "s_register_operand")
+  (match_operand:DI 2 "s_register_operand")
   (match_operand:SI 3 "mve_imm_selective_upto_8")
   (unspec:MVE_2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
  "TARGET_HAVE_MVE"
@@ -9888,16 +9888,16 @@ (define_expand "mve_vdwdupq_wb_u<mode>"
 (define_insn "mve_vdwdupq_wb_u<mode>_insn"
   [(set (match_operand:MVE_2 0 "s_register_operand" "=w")
 	(unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1")
-		       (match_operand:SI 3 "s_register_operand" "r")
+		       (subreg:SI (match_operand:DI 3 "s_register_operand" "r") 4)
 		       (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg")]
 	 VDWDUPQ))
    (set (match_operand:SI 1 "s_register_operand" "=e")
 	(unspec:SI [(match_dup 2)
-		    (match_dup 3)
+		    (subreg:SI (match_dup 3) 4)
 		    (match_dup 4)]
 	 VDWDUPQ))]
   "TARGET_HAVE_MVE"
-  "vdwdup.u%#<V_sz_elem>\t%q0, %2, %3, %4"
+  "vdwdup.u%#<V_sz_elem>\t%q0, %2, %R3, %4"
 )
 
 ;;
@@ -9907,7 +9907,7 @@ (define_expand "mve_vdwdupq_m_n_u<mode>"
  [(match_operand:MVE_2 0 "s_register_operand")
   (match_operand:MVE_2 1 "s_register_operand")
   (match_operand:SI 2 "s_register_operand")
-  (match_operand:SI 3 "s_register_operand")
+  (match_operand:DI 3 "s_register_operand")
   (match_operand:SI 4 "mve_imm_selective_upto_8")
   (match_operand:HI 5 "vpr_register_operand")]
  "TARGET_HAVE_MVE"
@@ -9927,7 +9927,7 @@ (define_expand "mve_vdwdupq_m_wb_u<mode>"
  [(match_operand:SI 0 "s_register_operand")
   (match_operand:MVE_2 1 "s_register_operand")
   (match_operand:SI 2 "s_register_operand")
-  (match_operand:SI 3 "s_register_operand")
+  (match_operand:DI 3 "s_register_operand")
   (match_operand:SI 4 "mve_imm_selective_upto_8")
   (match_operand:HI 5 "vpr_register_operand")]
  "TARGET_HAVE_MVE"
@@ -9945,22 +9945,22 @@ (define_expand "mve_vdwdupq_m_wb_u<mode>"
 ;;
 (define_insn "mve_vdwdupq_m_wb_u<mode>_insn"
   [(set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "w")
+	(unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0")
 		       (match_operand:SI 3 "s_register_operand" "1")
-		       (match_operand:SI 4 "s_register_operand" "r")
+		       (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4)
 		       (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg")
 		       (match_operand:HI 6 "vpr_register_operand" "Up")]
 	 VDWDUPQ_M))
    (set (match_operand:SI 1 "s_register_operand" "=e")
 	(unspec:SI [(match_dup 2)
 		    (match_dup 3)
-		    (match_dup 4)
+		    (subreg:SI (match_dup 4) 4)
 		    (match_dup 5)
 		    (match_dup 6)]
 	 VDWDUPQ_M))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;\tvdwdupt.u%#<V_sz_elem>\t%q2, %3, %4, %5"
+  "vpst\;\tvdwdupt.u%#<V_sz_elem>\t%q2, %3, %R4, %5"
   [(set_attr "type" "mve_move")
    (set_attr "length""8")])
 
@@ -9970,7 +9970,7 @@ (define_insn "mve_vdwdupq_m_wb_u<mode>_insn"
 (define_expand "mve_viwdupq_n_u<mode>"
  [(match_operand:MVE_2 0 "s_register_operand")
   (match_operand:SI 1 "s_register_operand")
-  (match_operand:SI 2 "s_register_operand")
+  (match_operand:DI 2 "s_register_operand")
   (match_operand:SI 3 "mve_imm_selective_upto_8")]
  "TARGET_HAVE_MVE"
 {
@@ -9987,7 +9987,7 @@ (define_expand "mve_viwdupq_n_u<mode>"
 (define_expand "mve_viwdupq_wb_u<mode>"
  [(match_operand:SI 0 "s_register_operand")
   (match_operand:SI 1 "s_register_operand")
-  (match_operand:SI 2 "s_register_operand")
+  (match_operand:DI 2 "s_register_operand")
   (match_operand:SI 3 "mve_imm_selective_upto_8")
   (unspec:MVE_2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
  "TARGET_HAVE_MVE"
@@ -10005,16 +10005,16 @@ (define_expand "mve_viwdupq_wb_u<mode>"
 (define_insn "mve_viwdupq_wb_u<mode>_insn"
   [(set (match_operand:MVE_2 0 "s_register_operand" "=w")
 	(unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1")
-		       (match_operand:SI 3 "s_register_operand" "r")
+		       (subreg:SI (match_operand:DI 3 "s_register_operand" "r") 4)
 		       (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg")]
 	 VIWDUPQ))
    (set (match_operand:SI 1 "s_register_operand" "=e")
 	(unspec:SI [(match_dup 2)
-		    (match_dup 3)
+		    (subreg:SI (match_dup 3) 4)
 		    (match_dup 4)]
 	 VIWDUPQ))]
   "TARGET_HAVE_MVE"
-  "viwdup.u%#<V_sz_elem>\t%q0, %2, %3, %4"
+  "viwdup.u%#<V_sz_elem>\t%q0, %2, %R3, %4"
 )
 
 ;;
@@ -10024,7 +10024,7 @@ (define_expand "mve_viwdupq_m_n_u<mode>"
  [(match_operand:MVE_2 0 "s_register_operand")
   (match_operand:MVE_2 1 "s_register_operand")
   (match_operand:SI 2 "s_register_operand")
-  (match_operand:SI 3 "s_register_operand")
+  (match_operand:DI 3 "s_register_operand")
   (match_operand:SI 4 "mve_imm_selective_upto_8")
   (match_operand:HI 5 "vpr_register_operand")]
  "TARGET_HAVE_MVE"
@@ -10044,7 +10044,7 @@ (define_expand "mve_viwdupq_m_wb_u<mode>"
  [(match_operand:SI 0 "s_register_operand")
   (match_operand:MVE_2 1 "s_register_operand")
   (match_operand:SI 2 "s_register_operand")
-  (match_operand:SI 3 "s_register_operand")
+  (match_operand:DI 3 "s_register_operand")
   (match_operand:SI 4 "mve_imm_selective_upto_8")
   (match_operand:HI 5 "vpr_register_operand")]
  "TARGET_HAVE_MVE"
@@ -10062,24 +10062,25 @@ (define_expand "mve_viwdupq_m_wb_u<mode>"
 ;;
 (define_insn "mve_viwdupq_m_wb_u<mode>_insn"
   [(set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "w")
+	(unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0")
 		       (match_operand:SI 3 "s_register_operand" "1")
-		       (match_operand:SI 4 "s_register_operand" "r")
+		       (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4)
 		       (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg")
 		       (match_operand:HI 6 "vpr_register_operand" "Up")]
 	 VIWDUPQ_M))
    (set (match_operand:SI 1 "s_register_operand" "=e")
 	(unspec:SI [(match_dup 2)
 		    (match_dup 3)
-		    (match_dup 4)
+		    (subreg:SI (match_dup 4) 4)
 		    (match_dup 5)
 		    (match_dup 6)]
 	 VIWDUPQ_M))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;\tviwdupt.u%#<V_sz_elem>\t%q2, %3, %4, %5"
+  "vpst\;\tviwdupt.u%#<V_sz_elem>\t%q2, %3, %R4, %5"
   [(set_attr "type" "mve_move")
    (set_attr "length""8")])
+
 (define_expand "mve_vstrwq_scatter_base_wb_<supf>v4si"
   [(match_operand:V4SI 0 "s_register_operand" "=w")
    (match_operand:SI 1 "mve_vldrd_immediate" "Ri")

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's
  2020-04-07 10:40 [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's Andre Vieira (lists)
@ 2020-04-07 10:57 ` Christophe Lyon
  2020-04-07 14:04   ` Andre Vieira (lists)
  2020-04-07 14:06 ` Kyrylo Tkachov
  1 sibling, 1 reply; 4+ messages in thread
From: Christophe Lyon @ 2020-04-07 10:57 UTC (permalink / raw)
  To: Andre Vieira (lists); +Cc: gcc-patches

On Tue, 7 Apr 2020 at 12:40, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>
> Hi,
>
> This patch fixes v[id]wdup intrinsics. They had two issues:
> 1) the predicated versions did not link the incoming inactive vector
> parameter to the output
> 2) The backend didn't enforce the wrap limit operand be in an odd register.
>
> 1) was fixed like we did for all other predicated intrinsics
> 2) requires a temporary hack where we pass the value in the top end of
> DImode operand. The proper fix would be to add a register CLASS but this
> interacted badly with other existing targets codegen.  We will look to
> fix this properly in GCC 11.
>
> Regression tested on arm-none-eabi.
>

Hi Andre,

How did you find problem 1? I suspect you are using an internal
simulator since qemu does not support MVE yet?
And you probably have runtime tests to exhibit this failure?

Thanks,

Christophe

> Is this OK for trunk?
>
> gcc/ChangeLog:
> 2020-04-07  Andre Vieira  <andre.simoesdiasvieira@arm.com>
>
>          * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
>          * config/arm/mve/md: Fix v[id]wdup patterns.
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's
  2020-04-07 10:57 ` Christophe Lyon
@ 2020-04-07 14:04   ` Andre Vieira (lists)
  0 siblings, 0 replies; 4+ messages in thread
From: Andre Vieira (lists) @ 2020-04-07 14:04 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

On 07/04/2020 11:57, Christophe Lyon wrote:
> On Tue, 7 Apr 2020 at 12:40, Andre Vieira (lists)
> <andre.simoesdiasvieira@arm.com> wrote:
>> Hi,
>>
>> This patch fixes v[id]wdup intrinsics. They had two issues:
>> 1) the predicated versions did not link the incoming inactive vector
>> parameter to the output
>> 2) The backend didn't enforce the wrap limit operand be in an odd register.
>>
>> 1) was fixed like we did for all other predicated intrinsics
>> 2) requires a temporary hack where we pass the value in the top end of
>> DImode operand. The proper fix would be to add a register CLASS but this
>> interacted badly with other existing targets codegen.  We will look to
>> fix this properly in GCC 11.
>>
>> Regression tested on arm-none-eabi.
>>
> Hi Andre,
>
> How did you find problem 1? I suspect you are using an internal
> simulator since qemu does not support MVE yet?
> And you probably have runtime tests to exhibit this failure?
Hi Christophe,

I actually found 1) because I was fixing 2). Though yes, I am trying to 
complement testing using an internal simulator and running tests in 
Arm's CMSIS DSP Library (https://github.com/ARM-software/CMSIS_5) that 
use MVE.

Cheers,
Andre
> Thanks,
>
> Christophe
>
>> Is this OK for trunk?
>>
>> gcc/ChangeLog:
>> 2020-04-07  Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>
>>           * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
>>           * config/arm/mve/md: Fix v[id]wdup patterns.
>>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's
  2020-04-07 10:40 [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's Andre Vieira (lists)
  2020-04-07 10:57 ` Christophe Lyon
@ 2020-04-07 14:06 ` Kyrylo Tkachov
  1 sibling, 0 replies; 4+ messages in thread
From: Kyrylo Tkachov @ 2020-04-07 14:06 UTC (permalink / raw)
  To: Andre Simoes Dias Vieira, gcc-patches

> -----Original Message-----
> From: Andre Vieira (lists) <andre.simoesdiasvieira@arm.com>
> Sent: 07 April 2020 11:41
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's
> 
> Hi,
> 
> This patch fixes v[id]wdup intrinsics. They had two issues:
> 1) the predicated versions did not link the incoming inactive vector
> parameter to the output
> 2) The backend didn't enforce the wrap limit operand be in an odd register.
> 
> 1) was fixed like we did for all other predicated intrinsics
> 2) requires a temporary hack where we pass the value in the top end of
> DImode operand. The proper fix would be to add a register CLASS but this
> interacted badly with other existing targets codegen.  We will look to
> fix this properly in GCC 11.
> 
> Regression tested on arm-none-eabi.
> 
> Is this OK for trunk?

Argh, not a fan of explicitly matching subregs as they usually break on big-endian, but we've disabled the intrinsics for big-endian for now ☹
So ok for trunk.
Thanks,
Kyrill

> 
> gcc/ChangeLog:
> 2020-04-07  Andre Vieira  <andre.simoesdiasvieira@arm.com>
> 
>          * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
>          * config/arm/mve/md: Fix v[id]wdup patterns.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-04-07 14:06 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-07 10:40 [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's Andre Vieira (lists)
2020-04-07 10:57 ` Christophe Lyon
2020-04-07 14:04   ` Andre Vieira (lists)
2020-04-07 14:06 ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).