public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/104438] New: Combine optimization exposed after pro_and_epilogue
@ 2022-02-08  4:29 crazylht at gmail dot com
  2022-02-08  9:15 ` [Bug rtl-optimization/104438] Combine optimization opportunity " rguenth at gcc dot gnu.org
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: crazylht at gmail dot com @ 2022-02-08  4:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104438

            Bug ID: 104438
           Summary: Combine optimization exposed after pro_and_epilogue
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---
              Host: x86_64-pc-linux-gnu
            Target: x86_64-*-* i?86-*-*

#include<stdint.h>
#include<immintrin.h>

/*
 * Gather four 4-byte rows starting at src into a single 256-bit vector.
 *
 * A 32-bit value is read at src + k * stride for k = 0..3.  Rows 0 and 1
 * become 32-bit elements 0 and 1 of src01, rows 2 and 3 become elements 0
 * and 1 of src23.  _mm_cvtsi32_si128 zeroes the remaining elements, so the
 * upper 64 bits of each 128-bit half stay zero.
 *
 * Returns src01 in the low 128 bits and src23 in the high 128 bits.
 *
 * NOTE(review): each row is read through an int32_t * cast of a uint8_t
 * pointer; nothing here checks alignment, and the caller must guarantee
 * that 4 bytes are readable at each of the four row addresses.
 */
static __m256i __attribute__((always_inline)) load8bit_4x4_avx2(const uint8_t
*const src,
    const uint32_t stride)
{
    __m128i src01, src23;
    /* Rows 0 and 1 -> low half. */
    src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
    src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
    /* Rows 2 and 3 -> high half. */
    src23 = _mm_cvtsi32_si128(*(int32_t*)(src + 2 * stride));
    src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
    return _mm256_setr_m128i(src01, src23);
}

/*
 * Accumulate _mm256_sad_epu8 results between src and ref, four rows of four
 * bytes per loop iteration, and return the reduced sum.
 *
 * The loop runs while y < height with y advancing by 4, and both pointers
 * advance by four strides per iteration.  width is accepted but unused.
 *
 * Returns the low 32 bits of (low 128-bit half + high 128-bit half) of the
 * accumulator.  When height is 0 the loop body never executes, so the
 * result is computed from the zero-initialised accumulator alone.
 */
uint32_t  compute4x_m_sad_avx2_intrin(
    uint8_t  *src,         // input parameter, source samples Ptr
    uint32_t  src_stride,  // input parameter, source stride
    uint8_t  *ref,         // input parameter, reference samples Ptr
    uint32_t  ref_stride,  // input parameter, reference stride
    uint32_t  height,      // input parameter, block height (M)
    uint32_t  width)       // input parameter, block width (N)
{
    __m128i xmm0;
    __m256i ymm = _mm256_setzero_si256();  // running accumulator, starts at zero
    uint32_t y;
    (void)width;  // silence the unused-parameter warning

    for (y = 0; y < height; y += 4) {
        const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
        const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
        ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
        // Step both pointers past the four rows just consumed.
        src += src_stride << 2;
        ref += ref_stride << 2;
    }

    // Fold the high 128-bit half of the accumulator into the low half.
    xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
        _mm256_extracti128_si256(ymm, 1));

    return (uint32_t)_mm_cvtsi128_si32(xmm0);
}  




gcc -O2 -mavx2 -S



suboptimal asm

.L4:
        vpxor   xmm3, xmm3, xmm3      # 12        [c=4 l=4]  movv4di_internal/0
        vpxor   xmm0, xmm0, xmm0      # 11        [c=4 l=4]  movv8si_internal/0
        vextracti128    xmm3, ymm3, 0x1 # 409     [c=4 l=6]  vec_extract_hi_v4di
        vpaddd  xmm0, xmm0, xmm3    # 429   [c=4 l=4]  *addv4si3/1
        vmovd   eax, xmm0     # 430     [c=4 l=4]  *movsi_internal/12
        ret       # 437       [c=0 l=1]  simple_return_internal

It can be optimized to just

        xor eax, eax

Before pro_and_epilogue, the CFG looks like this:

.L2
...asm...
jmp .L4

.L3:
        vpxor   xmm3, xmm3, xmm3      # 12        [c=4 l=4]  movv4di_internal/0
        vpxor   xmm0, xmm0, xmm0      # 11        [c=4 l=4]  movv8si_internal/0

.L4:

        vextracti128    xmm3, ymm3, 0x1 # 409     [c=4 l=6]  vec_extract_hi_v4di
        vpaddd  xmm0, xmm0, xmm3    # 429   [c=4 l=4]  *addv4si3/1
        vmovd   eax, xmm0     # 430     [c=4 l=4]  *movsi_internal/12
        ret       # 437       [c=0 l=1]  simple_return_internal


Since .L4 has two predecessor basic blocks, the sequence can't be optimized
away at that point. After pro_and_epilogue, however, GCC copies .L4 into .L2
and merges .L4 with .L3, which exposes the opportunity.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-02-25  1:06 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-08  4:29 [Bug target/104438] New: Combine optimization exposed after pro_and_epilogue crazylht at gmail dot com
2022-02-08  9:15 ` [Bug rtl-optimization/104438] Combine optimization opportunity " rguenth at gcc dot gnu.org
2022-02-08  9:50 ` rsandifo at gcc dot gnu.org
2022-02-08 14:42 ` segher at gcc dot gnu.org
2022-02-09  2:41 ` crazylht at gmail dot com
2022-02-09  4:17 ` crazylht at gmail dot com
2022-02-10  3:00 ` crazylht at gmail dot com
2022-02-10  3:01 ` crazylht at gmail dot com
2022-02-24 10:46 ` marxin at gcc dot gnu.org
2022-02-25  1:06 ` crazylht at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).