public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/103272] New: failure to use vld20/vld21 to vectorize for ARM MVE
@ 2021-11-16 10:07 clyon at gcc dot gnu.org
  0 siblings, 0 replies; only message in thread
From: clyon at gcc dot gnu.org @ 2021-11-16 10:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103272

            Bug ID: 103272
           Summary: failure to use vld20/vld21 to vectorize for ARM MVE
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: clyon at gcc dot gnu.org
  Target Milestone: ---

With current GCC trunk and -mcpu=cortex-m55 -mfpu=auto, for the following code:

#include <stdint.h>

/* Element type of the arrays processed by test(): two signed 16-bit
   fields, v1 and v2.  */
typedef struct {
  int16_t v1;
  int16_t v2;
} data;

/* Field-wise product of two arrays of `data`.

   For every index i in [0, L*16), stores x[i].v1 * y[i].v1 into d[i].v1
   and x[i].v2 * y[i].v2 into d[i].v2.

   d  destination array (written)
   x  first input array (read)
   y  second input array (read)
   L  scale factor for the trip count; the loop runs L*16 times

   All three pointers are restrict-qualified.  */
void test (data* restrict d, data* restrict x,
           data* restrict y, uint32_t L) {
  /* The bound L*16 is evaluated in uint32_t, the same type as i.  */
  for (uint32_t i = 0; i < L*16; i++) {
     /* v1 and v2 are handled by separate statements; each product is
        assigned back to an int16_t field.  */
     d[i].v1 = x[i].v1*y[i].v1;
     d[i].v2 = x[i].v2*y[i].v2;
   }
}

we generate:
test:
        lsls    r3, r3, #4
        beq     .L9
        lsls    r3, r3, #2
        push    {lr}
        sub     lr, r3, #16
        lsr     lr, lr, #4
        add     lr, lr, #1
        dls     lr, lr
.L3:
        vldrh.16        q3, [r2], #16
        vldrh.16        q2, [r1], #16
        vmul.i16        q3, q3, q2
        vstrh.16        q3, [r0], #16
        le      lr, .L3
        ldr     pc, [sp], #4
.L9:
        bx      lr


while LLVM generates:
test:
        push    {r7, lr}
        mov     r7, sp
        mov.w   r12, #0
        cmp.w   r12, r3, lsl #4
        it      eq
        popeq   {r7, pc}
        mvn     r12, #7
        add.w   r12, r12, r3, lsl #4
        movs    r3, #1
        add.w   lr, r3, r12, lsr #3
.LBB0_2:
        vld20.16        {q0, q1}, [r1]
        vld20.16        {q2, q3}, [r2]
        vld21.16        {q0, q1}, [r1]!
        vld21.16        {q2, q3}, [r2]!
        vmul.i16        q0, q2, q0
        vmul.i16        q1, q3, q1
        vst20.16        {q0, q1}, [r0]
        vst21.16        {q0, q1}, [r0]!
        le      lr, .LBB0_2
        pop     {r7, pc}


On the other hand, GCC does a better job of vectorizing the samples included in
the testsuite (gcc.target/arm/simd/mve-vld2.c).

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-11-16 10:07 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-16 10:07 [Bug target/103272] New: failure to use vld20/vld21 to vectorize for ARM MVE clyon at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).