public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/114852] New: jpegxl 10.0.1 is faster with clang18 than with gcc14
@ 2024-04-25 12:28 hubicka at gcc dot gnu.org
  0 siblings, 0 replies; only message in thread
From: hubicka at gcc dot gnu.org @ 2024-04-25 12:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114852

            Bug ID: 114852
           Summary: jpegxl 10.0.1 is faster with clang18 than with gcc14
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

https://www.phoronix.com/review/gcc14-clang18-amd-zen4/3
reports about an 8% difference.  I can measure 13% on zen3.  The code has
changed and it is no longer bound by push_back but runs the AVX2 versions of
the inner loops.

The hottest loops look comparable. GCC:

  0.00 │266:┌─→vmovaps      (%r14,%rax,4),%ymm0
  0.11 │    │  vmulps       (%rcx,%rax,4),%ymm7,%ymm2
  1.18 │    │  vfnmadd213ps (%rsi,%rax,4),%ymm11,%ymm0
  0.25 │    │  vmulps       %ymm2,%ymm0,%ymm0
  5.94 │    │  vroundps     $0x8,%ymm0,%ymm2
  0.35 │    │  vsubps       %ymm2,%ymm0,%ymm0
  1.05 │    │  vmulps       (%rdx,%rax,4),%ymm0,%ymm0
  3.19 │    │  vmovaps      %ymm0,0x0(%r13,%rax,4)
  0.15 │    │  vandps       %ymm10,%ymm2,%ymm0
  0.03 │    │  add          $0x8,%rax
  0.03 │    │  vcmpeqps     %ymm8,%ymm0,%ymm2
  0.09 │    │  vsqrtps      %ymm0,%ymm0
 27.25 │    │  vaddps       %ymm0,%ymm6,%ymm6
  0.35 │    │  vandnps      %ymm9,%ymm2,%ymm0
  0.12 │    │  vaddps       %ymm0,%ymm5,%ymm5
  0.05 │    ├──cmp          %r12,%rax
  0.02 │    └──jb           266

and clang

  0.00 │ c90:┌─→vmulps       (%r9,%rdx,4),%ymm0,%ymm2
  0.97 │     │  vmovaps      (%r15,%rdx,4),%ymm1
  0.36 │     │  vsubps       %ymm2,%ymm1,%ymm1
  4.24 │     │  vmulps       (%rcx,%rdx,4),%ymm4,%ymm2
  1.92 │     │  vmulps       %ymm2,%ymm1,%ymm1
  0.65 │     │  vroundps     $0x8,%ymm1,%ymm2
  0.06 │     │  vsubps       %ymm2,%ymm1,%ymm1
  1.11 │     │  vmulps       (%rax,%rdx,4),%ymm1,%ymm1
  3.53 │     │  vmovaps      %ymm1,(%rsi,%rdx,4)
  0.68 │     │  vandps       %ymm6,%ymm2,%ymm1
  0.23 │     │  vcmpneqps    %ymm5,%ymm2,%ymm2
  3.64 │     │  add          $0x8,%rdx
  0.24 │     │  vsqrtps      %ymm1,%ymm1
 22.16 │     │  vaddps       %ymm1,%ymm8,%ymm8
  0.25 │     │  vbroadcastss 0x31eba5(%rip),%ymm1        # 34f840
<jxl::palette_internal::GetPaletteValue(int const*, int, unsigned long, int,
int, int)::kMultiplier+0xe0>
  0.05 │     │  vandps       %ymm1,%ymm2,%ymm1
  0.04 │     │  vaddps       %ymm1,%ymm7,%ymm7
  0.11 │     ├──cmp          %rdi,%rdx
  0.07 │     └──jb           c90

GCC profile:
  10.78%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long,
unsigned long, jxl::ACSConfig const&, float con
   7.02%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::FindBestMultiplier(float const*, float const*, unsigned long,
float, float, bool) [clone .part.0]
   4.50%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&,
jxl::RectT<unsigned long> const&, long, jxl:
   4.47%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::TransformFromPixels(jxl::AcStrategy::Type,
float const*, unsigned long, float*, float*
   4.31%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::TransformToPixels(jxl::AcStrategy::Type,
float*, float*, unsigned long, float*)
   4.00%  cjxl             libjxl.so.0.10.1               [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternalNoSizeCheck(unsigned char const*, unsig
   3.64%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::TokenizeCoefficients(unsigned int const*, jxl::RectT<unsigned
long> const&, int const* restrict*, jxl::AcStra
   3.56%  cjxl             libm.so.6                      [.] __ieee754_pow_fma
   3.49%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::IDCT1DImpl<8ul, 8ul>::operator()(float
const*, unsigned long, float*, unsigned long, f
   3.43%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float,
jxl::Image3<float> const&, jxl::Re
   3.27%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<32ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2:
   3.16%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<8ul, 8ul>::operator()(float*,
float*) [clone .isra.0]
   2.87%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledIDCT<4ul,
8ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTTo
   2.83%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<64ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2:
   2.22%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float,
unsigned long, unsigned long, unsigned long,
   2.17%  cjxl             libm.so.6                      [.] __log1pf
   2.02%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::IDCT1DWrapper<64ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2
   1.89%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledDCT<4ul,
8ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTFro
   1.84%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<8ul, 4ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::
   1.77%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::IDCT1DWrapper<32ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2
   1.74%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledDCT<4ul,
4ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTFro


clang profile:
  12.96%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long,
unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned
int*)
   8.33%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::TransformToPixels(jxl::AcStrategy::Type,
float*, float*, unsigned long, float*)                                         
                                           ▒
   7.87%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::TransformFromPixels(jxl::AcStrategy::Type,
float const*, unsigned long, float*, float*)                                   
                                         ▒
   7.78%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long>
const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*,
jxl::Plane<int> const*, jxl::Quantizer const*, jxl::Rect▒
   5.03%  cjxl             libjxl.so.0.10.1               [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Pla▒
   4.66%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<16ul, 8ul>::operator()(float*,
float*)                                                                        
                                           ▒
   4.56%  cjxl             libjxl.so.0.10.1               [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternalNoSizeCheck(unsigned char const*, unsigned long,
unsigned long, unsigned long, unsigned long, JxlPixelFormat, unsigne▒
   4.23%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::(anonymous namespace)::IDCT1DImpl<16ul, 8ul>::operator()(float
const*, unsigned long, float*, unsigned long, float*)                          
                                            ▒
   4.22%  cjxl             libm.so.6                      [.] __ieee754_pow_fma
   3.62%  cjxl             libjxl.so.0.10.1               [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>▒
   3.04%  cjxl             libjxl.so.0.10.1               [.] void
jxl::N_AVX2::(anonymous namespace)::GenericTransposeBlock<0ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous
namespace)::DCTTo>(jxl::N_AVX2::(anonymous namespace)::Tra▒
   2.84%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float,
unsigned long, unsigned long, unsigned long, float*, float const*, int*)       
                                           ▒
   2.67%  cjxl             libjxl.so.0.10.1               [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(float const*, jxl::ThreadPool*,
jxl::Image3<float>*)::$_0>::CallDataFunc(void*, unsigned int, unsigned long)   
      ▒
   2.51%  cjxl             libm.so.6                      [.] __log1pf
   2.48%  cjxl             libjxl.so.0.10.1               [.]
jxl::N_AVX2::TokenizeCoefficients(unsigned int const*, jxl::RectT<unsigned
long> const&, int const* restrict*, jxl::AcStrategyImage const&,
jxl::YCbCrChromaSubsampling, jxl::Image3<int>*, std::vector<▒

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-04-25 12:28 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-25 12:28 [Bug middle-end/114852] New: jpegxl 10.0.1 is faster with clang18 than with gcc14 hubicka at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).