public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/114852] New: jpegxl 10.0.1 is faster with clang18 than with gcc14
@ 2024-04-25 12:28 hubicka at gcc dot gnu.org
0 siblings, 0 replies; only message in thread
From: hubicka at gcc dot gnu.org @ 2024-04-25 12:28 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114852
Bug ID: 114852
Summary: jpegxl 10.0.1 is faster with clang18 than with gcc14
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
https://www.phoronix.com/review/gcc14-clang18-amd-zen4/3
reports about an 8% difference. I can measure 13% on zen3. The code has changed
and it is no longer bound by push_back but runs the AVX2 version of the inner loops.
The hottest loops look comparable. GCC:
0.00 │266:┌─→vmovaps (%r14,%rax,4),%ymm0
0.11 │ │ vmulps (%rcx,%rax,4),%ymm7,%ymm2
1.18 │ │ vfnmadd213ps (%rsi,%rax,4),%ymm11,%ymm0
0.25 │ │ vmulps %ymm2,%ymm0,%ymm0
5.94 │ │ vroundps $0x8,%ymm0,%ymm2
0.35 │ │ vsubps %ymm2,%ymm0,%ymm0
1.05 │ │ vmulps (%rdx,%rax,4),%ymm0,%ymm0
3.19 │ │ vmovaps %ymm0,0x0(%r13,%rax,4)
0.15 │ │ vandps %ymm10,%ymm2,%ymm0
0.03 │ │ add $0x8,%rax
0.03 │ │ vcmpeqps %ymm8,%ymm0,%ymm2
0.09 │ │ vsqrtps %ymm0,%ymm0
27.25 │ │ vaddps %ymm0,%ymm6,%ymm6
0.35 │ │ vandnps %ymm9,%ymm2,%ymm0
0.12 │ │ vaddps %ymm0,%ymm5,%ymm5
0.05 │ ├──cmp %r12,%rax
0.02 │ └──jb 266
and clang:
0.00 │ c90:┌─→vmulps (%r9,%rdx,4),%ymm0,%ymm2
0.97 │ │ vmovaps (%r15,%rdx,4),%ymm1
0.36 │ │ vsubps %ymm2,%ymm1,%ymm1
4.24 │ │ vmulps (%rcx,%rdx,4),%ymm4,%ymm2
1.92 │ │ vmulps %ymm2,%ymm1,%ymm1
0.65 │ │ vroundps $0x8,%ymm1,%ymm2
0.06 │ │ vsubps %ymm2,%ymm1,%ymm1
1.11 │ │ vmulps (%rax,%rdx,4),%ymm1,%ymm1
3.53 │ │ vmovaps %ymm1,(%rsi,%rdx,4)
0.68 │ │ vandps %ymm6,%ymm2,%ymm1
0.23 │ │ vcmpneqps %ymm5,%ymm2,%ymm2
3.64 │ │ add $0x8,%rdx
0.24 │ │ vsqrtps %ymm1,%ymm1
22.16 │ │ vaddps %ymm1,%ymm8,%ymm8
0.25 │ │ vbroadcastss 0x31eba5(%rip),%ymm1 # 34f840 <jxl::palette_internal::GetPaletteValue(int const*, int, unsigned long, int, int, int)::kMultiplier+0xe0>
0.05 │ │ vandps %ymm1,%ymm2,%ymm1
0.04 │ │ vaddps %ymm1,%ymm7,%ymm7
0.11 │ ├──cmp %rdi,%rdx
0.07 │ └──jb c90
GCC profile:
10.78% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long,
unsigned long, jxl::ACSConfig const&, float con
7.02% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::FindBestMultiplier(float const*, float const*, unsigned long,
float, float, bool) [clone .part.0]
4.50% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::Symmetric5Row<jxl::WrapUnchanged>(jxl::Plane<float> const&,
jxl::RectT<unsigned long> const&, long, jxl:
4.47% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::TransformFromPixels(jxl::AcStrategy::Type,
float const*, unsigned long, float*, float*
4.31% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::TransformToPixels(jxl::AcStrategy::Type,
float*, float*, unsigned long, float*)
4.00% cjxl libjxl.so.0.10.1 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternalNoSizeCheck(unsigned char const*, unsig
3.64% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::TokenizeCoefficients(unsigned int const*, jxl::RectT<unsigned
long> const&, int const* restrict*, jxl::AcStra
3.56% cjxl libm.so.6 [.] __ieee754_pow_fma
3.49% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::IDCT1DImpl<8ul, 8ul>::operator()(float
const*, unsigned long, float*, unsigned long, f
3.43% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationImpl::ComputeTile(float, float,
jxl::Image3<float> const&, jxl::Re
3.27% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<32ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2:
3.16% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<8ul, 8ul>::operator()(float*,
float*) [clone .isra.0]
2.87% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledIDCT<4ul,
8ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTTo
2.83% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<64ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2:
2.22% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float,
unsigned long, unsigned long, unsigned long,
2.17% cjxl libm.so.6 [.] __log1pf
2.02% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::IDCT1DWrapper<64ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2
1.89% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledDCT<4ul,
8ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTFro
1.84% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::DCT1DWrapper<8ul, 4ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::
1.77% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::IDCT1DWrapper<32ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2
1.74% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::ComputeScaledDCT<4ul,
4ul>::operator()<jxl::N_AVX2::(anonymous namespace)::DCTFro
clang profile:
12.96% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::EstimateEntropy(jxl::AcStrategy const&, float, unsigned long,
unsigned long, jxl::ACSConfig const&, float const*, float*, float*, unsigned
int*)
8.33% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::TransformToPixels(jxl::AcStrategy::Type,
float*, float*, unsigned long, float*)
▒
7.87% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::TransformFromPixels(jxl::AcStrategy::Type,
float const*, unsigned long, float*, float*)
▒
7.78% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long>
const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*,
jxl::Plane<int> const*, jxl::Quantizer const*, jxl::Rect▒
5.03% cjxl libjxl.so.0.10.1 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::Symmetric5(jxl::Plane<float> const&, jxl::RectT<unsigned long>
const&, jxl::WeightsSymmetric5 const&, jxl::ThreadPool*, jxl::Pla▒
4.66% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::DCT1DImpl<16ul, 8ul>::operator()(float*,
float*)
▒
4.56% cjxl libjxl.so.0.10.1 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::ConvertFromExternalNoSizeCheck(unsigned char const*, unsigned long,
unsigned long, unsigned long, unsigned long, JxlPixelFormat, unsigne▒
4.23% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::(anonymous namespace)::IDCT1DImpl<16ul, 8ul>::operator()(float
const*, unsigned long, float*, unsigned long, float*)
▒
4.22% cjxl libm.so.6 [.] __ieee754_pow_fma
▒
3.62% cjxl libjxl.so.0.10.1 [.]
jxl::ThreadPool::RunCallState<jxl::N_AVX2::(anonymous
namespace)::AdaptiveQuantizationMap(float, jxl::Image3<float> const&,
jxl::RectT<unsigned long> const&, float, jxl::ThreadPool*, jxl::Plane<float>▒
3.04% cjxl libjxl.so.0.10.1 [.] void
jxl::N_AVX2::(anonymous namespace)::GenericTransposeBlock<0ul, 0ul,
jxl::N_AVX2::(anonymous namespace)::DCTFrom, jxl::N_AVX2::(anonymous
namespace)::DCTTo>(jxl::N_AVX2::(anonymous namespace)::Tra▒
2.84% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::AdjustQuantBlockAC(jxl::Quantizer const&, unsigned long, float,
unsigned long, unsigned long, unsigned long, float*, float const*, int*)
▒
2.67% cjxl libjxl.so.0.10.1 [.]
jxl::ThreadPool::RunCallState<jxl::Status (unsigned long),
jxl::N_AVX2::SRGBToXYB(float const*, jxl::ThreadPool*,
jxl::Image3<float>*)::$_0>::CallDataFunc(void*, unsigned int, unsigned long)
▒
2.51% cjxl libm.so.6 [.] __log1pf
▒
2.48% cjxl libjxl.so.0.10.1 [.]
jxl::N_AVX2::TokenizeCoefficients(unsigned int const*, jxl::RectT<unsigned
long> const&, int const* restrict*, jxl::AcStrategyImage const&,
jxl::YCbCrChromaSubsampling, jxl::Image3<int>*, std::vector<▒
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2024-04-25 12:28 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-25 12:28 [Bug middle-end/114852] New: jpegxl 10.0.1 is faster with clang18 than with gcc14 hubicka at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).