public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "hubicka at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug target/109811] libjxl 0.7 is a lot slower in GCC 13.1 vs Clang 16 Date: Wed, 17 May 2023 12:35:48 +0000 [thread overview] Message-ID: <bug-109811-4-ic7wFnEa3t@http.gcc.gnu.org/bugzilla/> (raw) In-Reply-To: <bug-109811-4@http.gcc.gnu.org/bugzilla/> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811 --- Comment #6 from Jan Hubicka <hubicka at gcc dot gnu.org> --- hottest loop in clang's profile is: for (size_t y = 0; y < opsin.ysize(); y++) { for (size_t x = 0; x < opsin.xsize(); x++) { if (is_background_row[y * is_background_stride + x]) continue; cc.clear(); stack.clear(); stack.emplace_back(x, y); size_t min_x = x; size_t max_x = x; size_t min_y = y; size_t max_y = y; std::pair<uint32_t, uint32_t> reference; bool found_border = false; bool all_similar = true; while (!stack.empty()) { std::pair<uint32_t, uint32_t> cur = stack.back(); stack.pop_back(); if (visited_row[cur.second * visited_stride + cur.first]) continue; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ closed by this continue. visited_row[cur.second * visited_stride + cur.first] = 1; if (cur.first < min_x) min_x = cur.first; if (cur.first > max_x) max_x = cur.first; if (cur.second < min_y) min_y = cur.second; if (cur.second > max_y) max_y = cur.second; if (paint_ccs) { cc.push_back(cur); } for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) { if (dx == 0 && dy == 0) continue; int next_first = static_cast<int32_t>(cur.first) + dx; int next_second = static_cast<int32_t>(cur.second) + dy; if (next_first < 0 || next_second < 0 || static_cast<uint32_t>(next_first) >= opsin.xsize() || static_cast<uint32_t>(next_second) >= opsin.ysize()) { continue; } std::pair<uint32_t, uint32_t> next{next_first, next_second}; if (!is_background_row[next.second * is_background_stride + next.first]) { stack.push_back(next); } else { if (!found_border) { reference = next; found_border = true; } else { if (!is_similar_b(next, reference)) all_similar = false; } } } } } if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize || max_y - min_y >= kMaxPatchSize) { continue; } size_t bpos = background_stride * reference.second + reference.first; float ref[3] = {background_rows[0][bpos], background_rows[1][bpos], background_rows[2][bpos]}; bool has_similar = false; for (size_t iy = std::max<int>( static_cast<int32_t>(min_y) - kHasSimilarRadius, 0); iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) { for (size_t ix = std::max<int>( static_cast<int32_t>(min_x) - kHasSimilarRadius, 0); ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize()); ix++) { size_t opos = opsin_stride * iy + ix; float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos], opsin_rows[2][opos]}; if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) { has_similar = true; } } } if (!has_similar) continue; info.emplace_back(); info.back().second.emplace_back(min_x, min_y); QuantizedPatch& patch = info.back().first; patch.xsize = max_x - min_x + 1; patch.ysize = max_y - min_y + 1; int max_value = 0; for (size_t c : {1, 0, 2}) { for (size_t iy = min_y; iy <= max_y; iy++) { for (size_t ix = min_x; ix <= max_x; ix++) { size_t offset = (iy - min_y) * patch.xsize + ix - min_x; patch.fpixels[c][offset] = opsin_rows[c][iy * opsin_stride + ix] - ref[c]; int val = pci.Quantize(patch.fpixels[c][offset], c); patch.pixels[c][offset] = val; if (std::abs(val) > max_value) max_value = std::abs(val); } } } if (max_value < kMinPeak) { info.pop_back(); continue; } if (paint_ccs) { float cc_color = rng.UniformF(0.5, 1.0); for (std::pair<uint32_t, uint32_t> p : cc) { ccs.Row(p.second)[p.first] = cc_color; } } } } I guess such a large loop nest with hottest loop not being the innermost is bad for register pressure. Clangs code is : 0.02 │1196:┌─→cmp %r10,-0xb8(%rbp) ▒ │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&, jxl::PassesEncoderState*, JxlCmsInterface co▒ │ │while (!stack.empty()) { ◆ 1.39 │ │↓ je 1690 ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ │11a3:│ mov -0x8(%r10),%rbx ▒ 9.80 │ │ mov -0x230(%rbp),%r14 ▒ │ │__gnu_cxx::__normal_iterator<std::pair<unsigned int, unsigned int>*, std::vector<std::pair<unsigned ▒ │ │{ return __normal_iterator(_M_current - __n); } ▒ 0.86 │ │ add $0xfffffffffffffff8,%r10 ▒ │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&, jxl::PassesEncoderState*, JxlCmsInterface co▒ 0.36 │ │ mov %rbx,%r13 ▒ 0.01 │ │ shr $0x20,%r13 ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 2.69 │ │ mov %ebx,%eax ▒ 0.00 │ │ mov %r13,%rcx ▒ 1.12 │ │ imul -0x180(%rbp),%rcx ▒ 0.78 │ │ add %rax,%rcx ▒ 2.73 │ ├──cmpb $0x0,(%r14,%rcx,1) ▒ 19.91 │ └──jne 1196 ▒ While GCC is longer: Percent│ Percent│ while (!stack.empty()) { ▒ │1241:┌─→cmp %rax,-0x370(%rbp) ▒ 0.70 │ │↓ je 1481 ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 0.02 │124e:│ mov -0x8(%rax),%rdx ▒ │ │std::vector<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒ │ │--this->_M_impl._M_finish; ▒ 0.63 │ │ sub $0x8,%rax ▒ 0.74 │ │ mov %rax,-0x368(%rbp) ▒ │ │FindTextLikePatches(): ▒ 0.01 │ │ mov %edx,%esi ◆ 0.35 │ │ mov %rdx,%rdi ▒ 0.82 │ │ mov %rdx,-0x490(%rbp) ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.00 │ │ mov -0x5f8(%rbp),%rdx ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 1.32 │ │ shr $0x20,%rdi ▒ 0.11 │ │ mov %rsi,%r15 ▒ 0.09 │ │ mov %edi,%ecx ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.72 │ │ mov -0x5f0(%rbp),%rdi ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 0.03 │ │ mov %rcx,%r12 ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.39 │ │ imul %rcx,%rdx ▒ 1.00 │ │ add %rsi,%rdx ▒ 1.35 │ │ add %rdi,%rdx ▒ 1.37 │ ├──cmpb $0x0,(%rdx) ▒ 9.56 │ └──jne 1241 ▒ while (!stack.empty()) { ▒ │1241:┌─→cmp %rax,-0x370(%rbp) ▒ 0.70 │ │↓ je 1481 ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 0.02 │124e:│ mov -0x8(%rax),%rdx ▒ │ │std::vector<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒ │ │--this->_M_impl._M_finish; ▒ 0.63 │ │ sub $0x8,%rax ▒ 0.74 │ │ mov %rax,-0x368(%rbp) ▒ │ │FindTextLikePatches(): ▒ 0.01 │ │ mov %edx,%esi ◆ 0.35 │ │ mov %rdx,%rdi ▒ 0.82 │ │ mov %rdx,-0x490(%rbp) ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.00 │ │ mov -0x5f8(%rbp),%rdx ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 1.32 │ │ shr $0x20,%rdi ▒ 0.11 │ │ mov %rsi,%r15 ▒ 0.09 │ │ mov %edi,%ecx ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.72 │ │ mov -0x5f0(%rbp),%rdi ▒ │ │std::pair<uint32_t, uint32_t> cur = stack.back(); ▒ 0.03 │ │ mov %rcx,%r12 ▒ │ │if (visited_row[cur.second * visited_stride + cur.first]) continue; ▒ 0.39 │ │ imul %rcx,%rdx ▒ 1.00 │ │ add %rsi,%rdx ▒ 1.35 │ │ add %rdi,%rdx ▒ 1.37 │ ├──cmpb $0x0,(%rdx) ▒ 9.56 │ └──jne 1241 ▒ Moreover we have heuristics saying that continue usually closes loops with low trip count. -fno-ivopts improves performance with --num_threads=0 however makes number of instructions worse. Curiously enought with --num_threads=1 and more ivopts is benefical.
next prev parent reply other threads:[~2023-05-17 12:35 UTC|newest] Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-05-11 14:24 [Bug tree-optimization/109811] New: libxjl " aros at gmx dot com 2023-05-11 15:30 ` [Bug target/109811] " xry111 at gcc dot gnu.org 2023-05-11 15:49 ` aros at gmx dot com 2023-05-12 6:59 ` rguenth at gcc dot gnu.org 2023-05-17 8:42 ` hubicka at gcc dot gnu.org 2023-05-17 9:08 ` hubicka at gcc dot gnu.org 2023-05-17 12:35 ` hubicka at gcc dot gnu.org [this message] 2023-05-17 13:59 ` [Bug target/109811] libjxl " juzhe.zhong at rivai dot ai 2023-05-17 14:04 ` hubicka at gcc dot gnu.org 2023-05-17 14:10 ` jakub at gcc dot gnu.org 2023-05-17 14:58 ` hubicka at gcc dot gnu.org 2023-05-18 13:40 ` hubicka at gcc dot gnu.org 2023-06-19 16:28 ` cvs-commit at gcc dot gnu.org 2023-11-02 14:12 ` hubicka at gcc dot gnu.org 2023-11-21 14:17 ` cvs-commit at gcc dot gnu.org 2023-11-24 23:13 ` hubicka at gcc dot gnu.org 2023-11-24 23:23 ` sjames at gcc dot gnu.org 2023-11-25 6:22 ` aros at gmx dot com 2023-11-25 13:31 ` hubicka at gcc dot gnu.org 2024-01-05 21:31 ` hubicka at gcc dot gnu.org 2024-01-20 17:23 ` pinskia at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-109811-4-ic7wFnEa3t@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).