public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/113166] New: RISC-V: Redundant move instructions in RVV intrinsic codes
@ 2023-12-28 22:23 juzhe.zhong at rivai dot ai
2023-12-28 22:32 ` [Bug c/113166] " juzhe.zhong at rivai dot ai
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-12-28 22:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113166
Bug ID: 113166
Summary: RISC-V: Redundant move instructions in RVV intrinsic
codes
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: juzhe.zhong at rivai dot ai
Target Milestone: ---
https://godbolt.org/z/rMaz9jqej
#include "riscv_vector.h"
void foo (void *in, void *out, int x)
{
vint32m2_t dup = __riscv_vmv_v_x_i32m2 (x, 8);
vint32m2x4_t tuple1 = __riscv_vlseg4e32_v_i32m2x4 (in, 8);
vint32m2x4_t tuple2 = __riscv_vlseg4e32_v_i32m2x4 (in + 16, 8);
vint32m2_t tmp1 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 0), 8);
vint32m2_t tmp2 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 0), 8);
tmp1 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2 (tuple1,
1), 8);
vint32m2_t tmp3 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 2), 8);
vint32m2_t tmp4 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 2), 8);
vint32m2_t tmp9 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 3), 8);
vint32m2_t tmp5 = __riscv_vnmsub_vv_i32m2 (__riscv_vget_v_i32m2x4_i32m2
(tuple1, 0), tmp9,tmp9, 8);
vint32m2_t tmp6 = __riscv_vmacc_vv_i32m2 (__riscv_vget_v_i32m2x4_i32m2
(tuple1, 1), tmp9,tmp9, 8);
vint32m2_t tmp7 = __riscv_vnmsac_vv_i32m2 (__riscv_vget_v_i32m2x4_i32m2
(tuple1, 2), tmp9,tmp9, 8);
vint32m2_t tmp8 = __riscv_vmacc_vv_i32m2 (__riscv_vget_v_i32m2x4_i32m2
(tuple1, 3), tmp9,tmp9, 8);
vint32m2x4_t create = __riscv_vcreate_v_i32m2x4 (tmp5, tmp6, tmp7, tmp8);
__riscv_vsseg4e32_v_i32m2x4 (out, create, 8);
}
GCC:
foo:
addi a5,a0,16
vsetivli zero,8,e32,m2,ta,ma
vlseg4e32.v v16,(a5)
vlseg4e32.v v8,(a0)
vmv.v.x v2,a2
vadd.vv v2,v2,v20
vmul.vv v2,v2,v14
vmv.v.v v4,v8
vnmsub.vv v4,v2,v2
vmv.v.v v18,v10
vmacc.vv v18,v2,v2
vmv2r.v v16,v4
vmv.v.v v20,v12
vnmsac.vv v20,v2,v2
vmv.v.v v22,v14
vmacc.vv v22,v2,v2
vsseg4e32.v v16,(a1)
ret
Clang:
foo: # @foo
vsetivli zero, 8, e32, m2, ta, ma
addi a3, a0, 16
vlseg4e32.v v8, (a3)
vlseg4e32.v v14, (a0)
vmv.v.x v8, a2
vadd.vv v8, v8, v12
vmul.vv v8, v8, v20
vnmsub.vv v14, v8, v8
vmacc.vv v16, v8, v8
vnmsac.vv v18, v8, v8
vmadd.vv v8, v8, v20
vmv.v.v v20, v8
vsseg4e32.v v14, (a1)
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug c/113166] RISC-V: Redundant move instructions in RVV intrinsic codes
2023-12-28 22:23 [Bug c/113166] New: RISC-V: Redundant move instructions in RVV intrinsic codes juzhe.zhong at rivai dot ai
@ 2023-12-28 22:32 ` juzhe.zhong at rivai dot ai
2024-01-18 14:06 ` [Bug middle-end/113166] " juzhe.zhong at rivai dot ai
2024-01-30 11:04 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2023-12-28 22:32 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113166
--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Suspect it is subreg issue again.
#include "riscv_vector.h"
void foo (void *in, void *out, int x)
{
vint32m2_t dup = __riscv_vmv_v_x_i32m2 (x, 8);
vint32m2x4_t tuple1 = __riscv_vlseg4e32_v_i32m2x4 (in, 8);
vint32m2x4_t tuple2 = __riscv_vlseg4e32_v_i32m2x4 (in + 16, 8);
vint32m2_t tmp1 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 0), 8);
vint32m2_t tmp2 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 0), 8);
tmp1 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2 (tuple1,
1), 8);
vint32m2_t tmp3 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 2), 8);
vint32m2_t tmp4 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 2), 8);
vint32m2_t tmp9 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 3), 8);
vint32m2_t tmp5 = __riscv_vadd_vv_i32m2_tu (tmp1, tmp9, tmp9, 8);
vint32m2_t tmp6 = __riscv_vadd_vv_i32m2_tu (tmp2, tmp9, tmp9, 8);
vint32m2_t tmp7 = __riscv_vadd_vv_i32m2_tu (tmp3, tmp9, tmp9, 8);
vint32m2_t tmp8 = __riscv_vadd_vv_i32m2_tu (tmp4, tmp9, tmp9, 8);
vint32m2x4_t create = __riscv_vcreate_v_i32m2x4 (tmp5, tmp6, tmp7, tmp8);
__riscv_vsseg4e32_v_i32m2x4 (out, create, 8);
//__riscv_vse32_v_i32m2 (out, tmp5, 8);
//__riscv_vse32_v_i32m2 (out + 16, tmp6, 8);
//__riscv_vse32_v_i32m2 (out + 32, tmp7, 8);
//__riscv_vse32_v_i32m2 (out + 64, tmp8, 8);
}
has move instructions.
But
#include "riscv_vector.h"
void foo (void *in, void *out, int x)
{
vint32m2_t dup = __riscv_vmv_v_x_i32m2 (x, 8);
vint32m2x4_t tuple1 = __riscv_vlseg4e32_v_i32m2x4 (in, 8);
vint32m2x4_t tuple2 = __riscv_vlseg4e32_v_i32m2x4 (in + 16, 8);
vint32m2_t tmp1 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 0), 8);
vint32m2_t tmp2 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 0), 8);
tmp1 = __riscv_vmul_vv_i32m2 (tmp1, __riscv_vget_v_i32m2x4_i32m2 (tuple1,
1), 8);
vint32m2_t tmp3 = __riscv_vadd_vv_i32m2 (dup, __riscv_vget_v_i32m2x4_i32m2
(tuple2, 2), 8);
vint32m2_t tmp4 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 2), 8);
vint32m2_t tmp9 = __riscv_vmul_vv_i32m2 (tmp3, __riscv_vget_v_i32m2x4_i32m2
(tuple1, 3), 8);
vint32m2_t tmp5 = __riscv_vadd_vv_i32m2_tu (tmp1, tmp9, tmp9, 8);
vint32m2_t tmp6 = __riscv_vadd_vv_i32m2_tu (tmp2, tmp9, tmp9, 8);
vint32m2_t tmp7 = __riscv_vadd_vv_i32m2_tu (tmp3, tmp9, tmp9, 8);
vint32m2_t tmp8 = __riscv_vadd_vv_i32m2_tu (tmp4, tmp9, tmp9, 8);
__riscv_vse32_v_i32m2 (out, tmp5, 8);
__riscv_vse32_v_i32m2 (out + 16, tmp6, 8);
__riscv_vse32_v_i32m2 (out + 32, tmp7, 8);
__riscv_vse32_v_i32m2 (out + 64, tmp8, 8);
}
No move instructions
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug middle-end/113166] RISC-V: Redundant move instructions in RVV intrinsic codes
2023-12-28 22:23 [Bug c/113166] New: RISC-V: Redundant move instructions in RVV intrinsic codes juzhe.zhong at rivai dot ai
2023-12-28 22:32 ` [Bug c/113166] " juzhe.zhong at rivai dot ai
@ 2024-01-18 14:06 ` juzhe.zhong at rivai dot ai
2024-01-30 11:04 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-18 14:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113166
--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
#include <riscv_vector.h>
#if TO_16
# define uintOut_t uint16_t
# define utf8_to_utf32_scalar utf8_to_utf16_scalar
# define utf8_to_utf32_rvv utf8_to_utf16_rvv
#else
# define uintOut_t uint32_t
#endif
size_t utf8_to_utf32_scalar(char const *src, size_t count, uintOut_t *dest);
size_t
utf8_to_utf32_rvv(char const *src, size_t count, uintOut_t *dest)
{
size_t tail = 3;
if (count < tail) return utf8_to_utf32_scalar(src, count, dest);
/* validate first three bytes */
{
size_t idx = tail;
while (idx < count && (src[idx] >> 6) == 0b10)
++idx;
uintOut_t buf[10];
if (idx > tail + 3 || !utf8_to_utf32_scalar(src, idx, buf))
return 0;
}
size_t n = count - tail;
uintOut_t *destBeg = dest;
static const uint64_t err1m[] = { 0x0202020202020202,
0x4915012180808080 };
static const uint64_t err2m[] = { 0xcbcbcb8b8383a3e7,
0xcbcbdbcbcbcbcbcb };
static const uint64_t err3m[] = { 0x0101010101010101,
0x01010101babaaee6 };
const vuint8m1_t err1tbl =
__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
const vuint8m1_t err2tbl =
__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
const vuint8m1_t err3tbl =
__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
const vuint8m2_t v64u8m2 = __riscv_vmv_v_x_u8m2(1<<6,
__riscv_vsetvlmax_e8m2());
const size_t vl8m1 = __riscv_vsetvlmax_e8m1();
const size_t vl16m2 = __riscv_vsetvlmax_e16m2();
#if TO_16
size_t vl8m2 = __riscv_vsetvlmax_e8m2();
const vbool4_t m4odd =
__riscv_vmsne_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1,
vl8m2), 0, vl8m2);
#endif
for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dest += vlOut) {
vl = __riscv_vsetvl_e8m2(n);
vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const*)src, vl);
uint64_t max =
__riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m2_u8m1(v0,
__riscv_vmv_s_x_u8m1(0, vl), vl));
/* fast path: ASCII */
if (max < 0b10000000) {
vlOut = vl;
#if TO_16
__riscv_vse16_v_u16m4(dest, __riscv_vzext_vf2_u16m4(v0,
vlOut), vlOut);
#else
__riscv_vse32_v_u32m8(dest, __riscv_vzext_vf4_u32m8(v0,
vlOut), vlOut);
#endif
continue;
}
/* see "Validating UTF-8 In Less Than One Instruction Per Byte"
* https://arxiv.org/abs/2010.03090 */
vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, src[vl+0], vl);
vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, src[vl+1], vl);
vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, src[vl+2], vl);
vuint8m2_t s1 =
__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v2),
4, vl16m2));
vuint8m2_t s3 =
__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v3),
4, vl16m2));
vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xf, vl);
vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xf, vl);
vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xf, vl);
#define VRGATHER_u8m1x2(tbl, idx) \
__riscv_vset_v_u8m1_u8m2(__riscv_vlmul_ext_v_u8m1_u8m2(
\
__riscv_vrgather_vv_u8m1(tbl,
__riscv_vget_v_u8m2_u8m1(idx, 0), vl8m1)), 1, \
__riscv_vrgather_vv_u8m1(tbl,
__riscv_vget_v_u8m2_u8m1(idx, 1), vl8m1));
vuint8m2_t err1 = VRGATHER_u8m1x2(err1tbl, idx1);
vuint8m2_t err2 = VRGATHER_u8m1x2(err2tbl, idx2);
vuint8m2_t err3 = VRGATHER_u8m1x2(err3tbl, idx3);
vuint8m2_t errs =
__riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl);
vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000-1,
vl);
vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000-1,
vl);
vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
vbool4_t err34 = __riscv_vmxor_mm_b4(is_34,
__riscv_vmsgtu_vx_u8m2_b4(errs, 0b01111111, vl), vl);
vbool4_t errm =
__riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(errs),
0, vl), err34, vl);
if (__riscv_vfirst_m_b4(errm , vl) >= 0)
return 0;
/* decoding */
/* mask of non continuation bytes */
vbool4_t m = __riscv_vmsne_vx_u8m2_b4(__riscv_vsrl_vx_u8m2(v0,
6, vl), 0b10, vl);
vlOut = __riscv_vcpop_m_b4(m, vl);
/* extract first and second bytes */
vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);
/* fast path: one and two byte */
if (max < 0b11100000) {
b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
/* Note: vmv.v.x 64 was purposfully not moved to the
* top to reduce register preasure, please benchmark
* before moving it to the top */
vbool4_t m1 = __riscv_vmsltu_vx_u8m2_b4(b1, 0b11000000,
vlOut);
b1 = __riscv_vand_vv_u8m2(b1,
__riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(63, vlOut), 0xFF, m1, vlOut),
vlOut);
vuint16m4_t b12 = __riscv_vwaddu_wv_u16m4(
__riscv_vwmulu_vv_u16m4(b1,
__riscv_vmerge_vxm_u8m2(v64u8m2, 1, m1, vlOut), vlOut),
__riscv_vmerge_vxm_u8m2(b2, 0, m1,
vlOut), vlOut);
#if TO_16
__riscv_vse16_v_u16m4(dest, b12, vlOut);
#else
__riscv_vse32_v_u32m8(dest,
__riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
#endif
continue;
}
/* fast path: one, two and three byte */
if (max < 0b11110000) {
vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);
vbool4_t m1 = __riscv_vmsltu_vx_u8m2_b4(b1, 0b11000000,
vlOut);
vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111,
vlOut);
b1 = __riscv_vand_vv_u8m2(b1,
__riscv_vmerge_vxm_u8m2(__riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(63,
vlOut), 0xFF, m1, vlOut), 15, m3, vlOut), vlOut);
vuint16m4_t b12 = __riscv_vwaddu_wv_u16m4(
__riscv_vwmulu_vv_u16m4(b1,
__riscv_vmerge_vxm_u8m2(v64u8m2, 1, m1, vlOut), vlOut),
__riscv_vmerge_vxm_u8m2(b2, 0, m1,
vlOut), vlOut);
vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(m3, b12,
__riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
#if TO_16
__riscv_vse16_v_u16m4(dest, b123, vlOut);
#else
__riscv_vse32_v_u32m8(dest,
__riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
#endif
continue;
}
/* extract third and fourth bytes */
vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
#define M1_COMMON(idx) \
vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \
vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \
vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \
vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \
/* remove prefix from trailing bytes */ \
c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \
c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \
c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \
/* remove prefix from leading bytes
*
* We shift left and then right by the number of bytes in the prefix,
* which can be calculated as follows:
* max(x-10, 0)
* 0xxx -> 0000-0111 -> shift by 0 or 1 -> 0
* 10xx -> 1000-1011 -> don't care
* 110x -> 1100,1101 -> shift by 3 -> 2,3
* 1110 -> 1110 -> shift by 4 -> 4
* 1111 -> 1111 -> shift by 5 -> 5
*
* vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
* just need to manually detect and handle the one special case:
*/ \
vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \
shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10,
vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \
\
c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \
c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \
/* unconditionally widen and combine to c1234 */ \
vuint16m2_t c34 =
__riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vv_u16m2(c3,__riscv_vlmul_trunc_v_u8m2_u8m1(v64u8m2),
vlOut), c4, vlOut); \
vuint16m2_t c12 =
__riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vv_u16m2(c1,__riscv_vlmul_trunc_v_u8m2_u8m1(v64u8m2),
vlOut), c2, vlOut); \
vuint32m4_t c1234 =
__riscv_vwaddu_wv_u32m4(__riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34,
vlOut); \
/* derive required right-shift amount from `shift` to reduce
* c1234 to the required number of bytes */ \
c1234 = __riscv_vsrl_vv_u32m4(c1234, __riscv_vzext_vf4_u32m4( \
__riscv_vmul_vx_u8m1(__riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2,
vlOut), 3, vlOut), 6, vlOut), \
vlOut), vlOut);
#define DOWN __riscv_vreinterpret_v_u32m4_u16m4
#define UP __riscv_vreinterpret_v_u16m4_u32m4
#if !TO_16
#define M1_STORE \
size_t vlDest = vlOut; \
__riscv_vse32_v_u32m4(dest, c1234, vlDest);
#else
#define M1_STORE \
/* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
* to [110110aaaaaaaaaa|110111bbbbbbbbbb] */ \
vuint32m4_t t0 = __riscv_vsub_vx_u32m4(c1234, 0x10000, vlOut); \
vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 6, vlOut); \
t1 = UP(__riscv_vmerge_vvm_u16m4(DOWN(t0), DOWN(t1), m4odd, vlOut*2));
\
t1 = __riscv_vand_vx_u32m4(t1, 0x3ff03ff, vlOut); \
t1 = __riscv_vor_vx_u32m4(t1, 0xd800dc00, vlOut); \
/* merge 1 byte c1234 and 2 byte t1 */ \
vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(c1234, 0xffff, vlOut); \
c1234 = __riscv_vmerge_vvm_u32m4(c1234, t1, m4, vlOut); \
/* swap c1234 two byte pairs */ \
c1234 = __riscv_vor_vv_u32m4( \
__riscv_vsll_vx_u32m4(c1234, 16, vlOut), \
__riscv_vsrl_vx_u32m4(c1234, 16, vlOut), \
vlOut); \
/* compress and store */ \
vbool4_t mOut =
__riscv_vmor_mm_b4(__riscv_vmsne_vx_u16m4_b4(DOWN(c1234), 0, vlOut*2), m4odd,
vlOut*2); \
c1234 = UP(__riscv_vcompress_vm_u16m4(DOWN(c1234), mOut, vlOut*2)); \
size_t vlDest = __riscv_vcpop_m_b4(mOut, vlOut*2); \
__riscv_vse16_v_u16m4(dest, DOWN(c1234), vlDest);
#endif
/* Unrolling this manually reduces register pressure and allows
* us to terminate early. */
{
size_t vlOutm2 = vlOut;
vlOut = __riscv_vsetvl_e8m1(vlOut);
M1_COMMON(0)
M1_STORE
if (vlOutm2 == vlOut) {
vlOut = vlDest;
continue;
}
dest += vlDest;
vlOut = vlOutm2 - vlOut;
}
{
M1_COMMON(1)
M1_STORE
vlOut = vlDest;
}
#undef M1_COMMON
#undef M1_STORE
#undef DOWN
#undef UP
}
/* validate the last character and reparse it + tail */
if (count > tail) {
if ((src[0] >> 6) == 0b10)
--dest;
while ((src[0] >> 6) == 0b10 && tail < count)
--src, ++tail;
#if TO_16
/* go back one more, when on high surrogate */
if (dest[-1] >= 0xD800 && dest[-1] <= 0xDBFF)
--dest;
#endif
}
size_t ret = utf8_to_utf32_scalar(src, tail, dest);
if (ret == 0) return 0;
return (size_t)(dest - destBeg) + ret;
}
#undef uintOut_t
#undef utf8_to_utf32_scalar
#undef utf8_to_utf32_rvv
Too many vector spills
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug middle-end/113166] RISC-V: Redundant move instructions in RVV intrinsic codes
2023-12-28 22:23 [Bug c/113166] New: RISC-V: Redundant move instructions in RVV intrinsic codes juzhe.zhong at rivai dot ai
2023-12-28 22:32 ` [Bug c/113166] " juzhe.zhong at rivai dot ai
2024-01-18 14:06 ` [Bug middle-end/113166] " juzhe.zhong at rivai dot ai
@ 2024-01-30 11:04 ` juzhe.zhong at rivai dot ai
2 siblings, 0 replies; 4+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-30 11:04 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113166
--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
#include <cstdint>
#include <riscv_vector.h>
template <size_t length>
inline vuint8m1_t tail_load(void const* data);
template<>
inline vuint8m1_t tail_load<sizeof(uint64_t)>(void const* data) {
uint64_t const* ptr64 = reinterpret_cast<uint64_t const*>(data);
#if 1
const vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0,
__riscv_vsetvlmax_e64m1());
vuint64m1_t v64 = __riscv_vslide1up(zero, *ptr64,
__riscv_vsetvlmax_e64m1());
return __riscv_vreinterpret_u8m1(v64);
#elif 1
vuint64m1_t v64 = __riscv_vmv_s_x_u64m1(*ptr64, 1);
const vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0,
__riscv_vsetvlmax_e64m1());
v64 = __riscv_vslideup(v64, zero, 1, __riscv_vsetvlmax_e8m1());
return __riscv_vreinterpret_u8m1(v64);
#elif 1
vuint64m1_t v64 = __riscv_vle64_v_u64m1(ptr64, 1);
const vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0,
__riscv_vsetvlmax_e64m1());
v64 = __riscv_vslideup(v64, zero, 1, __riscv_vsetvlmax_e8m1());
return __riscv_vreinterpret_u8m1(v64);
#else
vuint8m1_t v = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(ptr64, 1));
const vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0, __riscv_vsetvlmax_e8m1());
return __riscv_vslideup(v, zero, sizeof(uint64_t),
__riscv_vsetvlmax_e8m1());
#endif
}
vuint8m1_t test2(uint64_t data) {
return tail_load<sizeof(data)>(&data);
}
GCC ASM:
test2(unsigned long):
vsetvli a5,zero,e64,m1,ta,ma
vmv.v.i v8,0
vmv1r.v v9,v8
vslide1up.vx v8,v9,a0
ret
LLVM ASM:
test2(unsigned long): # @test2(unsigned long)
vsetvli a1, zero, e64, m1, ta, ma
vmv.v.i v9, 0
vslide1up.vx v8, v9, a0
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-01-30 11:05 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-28 22:23 [Bug c/113166] New: RISC-V: Redundant move instructions in RVV intrinsic codes juzhe.zhong at rivai dot ai
2023-12-28 22:32 ` [Bug c/113166] " juzhe.zhong at rivai dot ai
2024-01-18 14:06 ` [Bug middle-end/113166] " juzhe.zhong at rivai dot ai
2024-01-30 11:04 ` juzhe.zhong at rivai dot ai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).