public inbox for libstdc++@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
@ 2020-06-11  6:32 François Dumont
  2020-06-22  6:27 ` François Dumont
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: François Dumont @ 2020-06-11  6:32 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2124 bytes --]

As we are on patching algos we still have this old one.

     From the original patch I only kept the memory optimization part as 
the new performance test was not showing good result for the other part 
to change pivot value. I also kept the small change in 
get_temporary_buffer even if I don't have strong feeling about it, it 
just make sure that we'll try to allocate 1 element as a last chance 
allocation.

     Note that there is still place for an improvement. If we miss 
memory on the heap we then use a recursive implementation which then 
rely on stack memory. I would be surprise that a system which miss heap 
memory would have no problem to allocate about the same on the stack so 
we will surely end up in a stack overflow. I still have this on my todo 
even if I already made several tries with no satisfying result in terms 
of performance.

     Tested under Linux x86_64.

Commit message:

     libstdc++: Limit memory allocation in stable_sort/inplace_merge (PR 
83938)

     Reduce memory consumption in stable_sort/inplace_merge to what is used.

     Co-authored-by: François Dumont  <fdumont@gcc.gnu.org>

     libstdc++-v3/ChangeLog:

     2020-06-11  John Chang  <john.chang@samba.tv>
                 François Dumont  <fdumont@gcc.gnu.org>

             PR libstdc++/83938
             * include/bits/stl_tempbuf.h (get_temporary_buffer): Change 
__len
             computation in the loop.
             * include/bits/stl_algo.h:
             (__inplace_merge): Take temporary buffer length from 
smallest range.
             (__stable_sort): Limit temporary buffer length.
             * testsuite/25_algorithms/inplace_merge/1.cc (test03): Test 
different
             pivot positions.
             * testsuite/performance/25_algorithms/stable_sort.cc: Test 
stable_sort
             under different heap memory conditions.
             * testsuite/performance/25_algorithms/inplace_merge.cc: New.

Ok to commit ?

François


[-- Attachment #2: pr83938.patch --]
[-- Type: text/x-patch, Size: 16082 bytes --]

diff --git a/libstdc++-v3/include/bits/stl_algo.h b/libstdc++-v3/include/bits/stl_algo.h
index fd6edd0d5f4..44798ffaa7f 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -2531,7 +2531,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       const _DistanceType __len2 = std::distance(__middle, __last);
 
       typedef _Temporary_buffer<_BidirectionalIterator, _ValueType> _TmpBuf;
-      _TmpBuf __buf(__first, __len1 + __len2);
+      _TmpBuf __buf(__first, std::min(__len1, __len2));
 
       if (__buf.begin() == 0)
 	std::__merge_without_buffer
@@ -2740,6 +2740,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  std::__merge_sort_with_buffer(__first, __middle, __buffer, __comp);
 	  std::__merge_sort_with_buffer(__middle, __last, __buffer, __comp);
 	}
+
       std::__merge_adaptive(__first, __middle, __last,
 			    _Distance(__middle - __first),
 			    _Distance(__last - __middle),
@@ -5006,8 +5007,11 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO
       typedef typename iterator_traits<_RandomAccessIterator>::difference_type
 	_DistanceType;
 
+      if (__first == __last)
+	return;
+
       typedef _Temporary_buffer<_RandomAccessIterator, _ValueType> _TmpBuf;
-      _TmpBuf __buf(__first, std::distance(__first, __last));
+      _TmpBuf __buf(__first, (__last - __first + 1) / 2);
 
       if (__buf.begin() == 0)
 	std::__inplace_stable_sort(__first, __last, __comp);
diff --git a/libstdc++-v3/include/bits/stl_tempbuf.h b/libstdc++-v3/include/bits/stl_tempbuf.h
index f6f17960472..d76ed7f7ea6 100644
--- a/libstdc++-v3/include/bits/stl_tempbuf.h
+++ b/libstdc++-v3/include/bits/stl_tempbuf.h
@@ -110,7 +110,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 							std::nothrow));
 	  if (__tmp != 0)
 	    return std::pair<_Tp*, ptrdiff_t>(__tmp, __len);
-	  __len /= 2;
+	  __len = __len == 1 ? 0 : ((__len + 1) / 2);
 	}
       return std::pair<_Tp*, ptrdiff_t>(static_cast<_Tp*>(0), 0);
     }
diff --git a/libstdc++-v3/testsuite/25_algorithms/inplace_merge/1.cc b/libstdc++-v3/testsuite/25_algorithms/inplace_merge/1.cc
index 5859f0363d5..7f78687d0aa 100644
--- a/libstdc++-v3/testsuite/25_algorithms/inplace_merge/1.cc
+++ b/libstdc++-v3/testsuite/25_algorithms/inplace_merge/1.cc
@@ -28,7 +28,7 @@ using std::inplace_merge;
 
 typedef test_container<int, bidirectional_iterator_wrapper> container;
 
-void 
+void
 test1()
 {
   int array[] = { 1 };
@@ -39,7 +39,7 @@ test1()
   inplace_merge(con2.begin(), con2.begin(), con2.end());
 }
 
-void 
+void
 test2()
 {
   int array[] = { 0, 2, 4, 1, 3, 5 };
@@ -60,30 +60,34 @@ struct S
   { return a < _s.a; }
 };
 
-void 
+void
 test3()
 {
   S s[8];
-  s[0].a = 0;
-  s[1].a = 1;
-  s[2].a = 2;
-  s[3].a = 3;
-  s[4].a = 0;
-  s[5].a = 1;
-  s[6].a = 2;
-  s[7].a = 3;
-
-  s[0].b = 0;
-  s[1].b = 1;
-  s[2].b = 2;
-  s[3].b = 3;
-  s[4].b = 4;
-  s[5].b = 5;
-  s[6].b = 6;
-  s[7].b = 7;
-
-  inplace_merge(s, s + 4, s + 8);
-  VERIFY( s[0].b == 0 && s[1].b == 4 && s[2].b == 1 && s[3].b == 5 );
+  for (int pivot_idx = 0; pivot_idx < 8; ++pivot_idx)
+    {
+      int bval = 0;
+      for (int i = 0; i != pivot_idx; ++i)
+	{
+	  s[i].a = i;
+	  s[i].b = bval++;
+	}
+
+      for (int i = pivot_idx; i != 8; ++i)
+	{
+	  s[i].a = i - pivot_idx;
+	  s[i].b = bval++;
+	}
+
+      inplace_merge(s, s + pivot_idx, s + 8);
+
+      for (int i = 1; i < 8; ++i)
+	{
+	  VERIFY( !(s[i] < s[i - 1]) );
+	  if (s[i - 1].a == s[i].a)
+	    VERIFY( s[i - 1].b < s[i].b );
+	}
+    }
 }
 
 int 
diff --git a/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc b/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc
new file mode 100644
index 00000000000..780c21912c7
--- /dev/null
+++ b/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc
@@ -0,0 +1,290 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#include <vector>
+#include <algorithm>
+#include <cmath>
+
+#include <testsuite_new_operators.h>
+#include <testsuite_performance.h>
+
+const int max_size = 10000000;
+const int small_size = 200000;
+const int front_pivot_idx = 10000;
+int middle_pivot_idx = max_size / 2;
+int back_pivot_idx = max_size - front_pivot_idx;
+
+void bench(int mem_threshold, int pivot_index,
+	   std::vector<int> revv,
+	   std::vector<int> fwdv,
+	   std::vector<int> wstv,
+	   std::vector<int> rndv)
+{
+  using namespace __gnu_test;
+
+  time_counter time;
+  resource_counter resource;
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(revv.begin(), revv.begin() + pivot_index, revv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+
+  report_performance(__FILE__, "reverse", time, resource);
+  clear_counters(time, resource);
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(fwdv.begin(), fwdv.begin() + pivot_index, fwdv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+
+  report_performance(__FILE__, "forward", time, resource);
+  clear_counters(time, resource);
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(wstv.begin(), wstv.begin() + pivot_index, wstv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+
+  report_performance(__FILE__, "worst", time, resource);
+  clear_counters(time, resource);
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(rndv.begin(), rndv.begin() + pivot_index, rndv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+  report_performance(__FILE__, "random", time, resource);
+}
+
+void mem_bench(double mem_ratio,
+	       const std::vector<int>& front_revv,
+	       const std::vector<int>& middle_revv,
+	       const std::vector<int>& back_revv,
+	       const std::vector<int>& fwdv,
+	       const std::vector<int>& front_wstv,
+	       const std::vector<int>& middle_wstv,
+	       const std::vector<int>& back_wstv,
+	       const std::vector<int>& front_rndv,
+	       const std::vector<int>& middle_rndv,
+	       const std::vector<int>& back_rndv)
+{
+  using namespace __gnu_test;
+
+  time_counter time;
+  resource_counter resource;
+
+  int max_mem = (int)std::ceil(front_pivot_idx * mem_ratio) * sizeof(int);
+  start_counters(time, resource);
+  bench(max_mem, front_pivot_idx, front_revv, fwdv, front_wstv, front_rndv);
+  stop_counters(time, resource);
+  report_performance(__FILE__, "front pivot", time, resource);
+  clear_counters(time, resource);
+
+  max_mem = (int)std::ceil(middle_pivot_idx * mem_ratio) * sizeof(int);
+  start_counters(time, resource);
+  bench(max_mem, middle_pivot_idx, middle_revv, fwdv, middle_wstv, middle_rndv);
+  stop_counters(time, resource);
+  report_performance(__FILE__, "middle pivot", time, resource);
+  clear_counters(time, resource);
+
+  max_mem = (int)std::ceil(front_pivot_idx * mem_ratio) * sizeof(int);
+  start_counters(time, resource);
+  bench(max_mem, back_pivot_idx, back_revv, fwdv, back_wstv, back_rndv);
+  stop_counters(time, resource);
+  report_performance(__FILE__, "back pivot", time, resource);
+}
+
+void init_reverse(std::vector<int>& v, size_t pivot_index)
+{
+  int val = 0;
+  for (size_t i = pivot_index; i != v.size(); ++i)
+    v[i] = val++;
+  for (size_t i = 0; i != pivot_index; ++i)
+    v[i] = val++;
+}
+
+void init_forward(std::vector<int>& v)
+{
+  int val = 0;
+  for (size_t i = 0; i != v.size(); ++i)
+    v[i] = val++;
+}
+
+void init_worst(std::vector<int>& v, size_t pivot_index)
+{
+  int val = 0;
+  if (pivot_index + 1 > v.size() / 2)
+    {
+      for (size_t i = 0; i != pivot_index; val += 2, ++i)
+	v[i] = val;
+      val = 1;
+    }
+  else
+    {
+      for (size_t i = pivot_index; i != v.size(); val += 2, ++i)
+	v[i] = val;
+      val -= pivot_index * 2 + 1;
+    }
+
+  if (pivot_index + 1 > v.size() / 2)
+    for (size_t i = pivot_index; i != v.size(); val += 2, ++i)
+      v[i] = val;
+  else
+    for (size_t i = 0; i != pivot_index; val += 2, ++i)
+      v[i] = val;
+}
+
+void init_random(std::vector<int>& v)
+{
+  // a simple pseudo-random series which does not rely on rand() and friends
+  v[0] = 0;
+  for (size_t i = 1; i != v.size(); ++i)
+    v[i] = (v[i-1] + 110211473) * 745988807;
+}
+
+void reduce_size(std::vector<int>& front_v,
+		 std::vector<int>& middle_v,
+		 std::vector<int>& back_v)
+{
+  front_v.erase(front_v.begin() + front_pivot_idx,
+		front_v.end() - back_pivot_idx);
+  middle_v.erase(middle_v.begin() + small_size / 2,
+		 middle_v.end() - small_size / 2);
+  back_v.erase(back_v.begin() + back_pivot_idx,
+	       back_v.end() - front_pivot_idx);
+}
+
+int main()
+{
+  using namespace __gnu_test;
+
+  // No constraint to build vectors.
+  set_new_limit(~size_t(0));
+
+  std::vector<int> front_revv(max_size);
+  init_reverse(front_revv, front_pivot_idx);
+
+  std::vector<int> middle_revv(max_size);
+  init_reverse(middle_revv, middle_pivot_idx);
+
+  std::vector<int> back_revv(max_size);
+  init_reverse(back_revv, back_pivot_idx);
+
+  std::vector<int> fwdv(max_size);
+  init_forward(fwdv);
+
+  std::vector<int> front_wstv(max_size);
+  init_worst(front_wstv, front_pivot_idx);
+
+  std::vector<int> middle_wstv(max_size);
+  init_worst(middle_wstv, middle_pivot_idx);
+
+  std::vector<int> back_wstv(max_size);
+  init_worst(back_wstv, back_pivot_idx);
+
+  std::vector<int> front_rndv(max_size);
+  init_random(front_rndv);
+  std::vector<int> middle_rndv(front_rndv);
+  std::vector<int> back_rndv(front_rndv);
+
+  sort(front_rndv.begin(), front_rndv.begin() + front_pivot_idx);
+  sort(front_rndv.begin() + front_pivot_idx, front_rndv.end());
+
+  sort(middle_rndv.begin(), middle_rndv.begin() + middle_pivot_idx);
+  sort(middle_rndv.begin() + middle_pivot_idx, middle_rndv.end());
+
+  sort(back_rndv.begin(), back_rndv.begin() + back_pivot_idx);
+  sort(back_rndv.begin() + back_pivot_idx, back_rndv.end());
+
+  time_counter time;
+  resource_counter resource;
+
+  start_counters(time, resource);
+
+  // No limit.
+  mem_bench(1.0,
+	    front_revv, middle_revv, back_revv,
+	    fwdv,
+	    front_wstv, middle_wstv, back_wstv,
+	    front_rndv, middle_rndv, back_rndv);
+
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1 / 1 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+
+  // Limit to the fourth.
+  mem_bench(1.0 / 4,
+	    front_revv, middle_revv, back_revv,
+	    fwdv,
+	    front_wstv, middle_wstv, back_wstv,
+	    front_rndv, middle_rndv, back_rndv);
+
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1 / 4 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+
+  // Really limit allocation.
+  mem_bench(1.0 / 64,
+	    front_revv, middle_revv, back_revv,
+	    fwdv,
+	    front_wstv, middle_wstv, back_wstv,
+	    front_rndv, middle_rndv, back_rndv);
+
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1 /64 memory", time, resource);
+  clear_counters(time, resource);
+
+  middle_pivot_idx = small_size / 2;
+  back_pivot_idx = small_size - front_pivot_idx;
+  reduce_size(front_revv, middle_revv, back_revv);
+  fwdv.resize(small_size);
+  reduce_size(front_wstv, middle_wstv, back_wstv);
+  reduce_size(front_rndv, middle_rndv, back_rndv);
+
+  start_counters(time, resource);
+
+  // No memory.
+  mem_bench(0.0,
+	    front_revv, middle_revv, back_revv,
+	    fwdv,
+	    front_wstv, middle_wstv, back_wstv,
+	    front_rndv, middle_rndv, back_rndv);
+
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 0 / 1 memory", time, resource);
+  return 0;
+}
diff --git a/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc b/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
index 02b869dd195..fe526638aaf 100644
--- a/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
+++ b/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
@@ -17,49 +17,107 @@
 
 #include <vector>
 #include <algorithm>
+
+#include <testsuite_new_operators.h>
 #include <testsuite_performance.h>
 
-int main()
+const int max_size = 10000000;
+const int small_size = 200000;
+
+void bench(size_t mem_threshold,
+	   std::vector<int> revv,
+	   std::vector<int> fwdv,
+	   std::vector<int> rndv)
 {
   using namespace __gnu_test;
 
   time_counter time;
   resource_counter resource;
 
-  const int max_size = 10000000;
-
-  std::vector<int> v(max_size);
-
-  for (int i = 0; i < max_size; ++i)
-    v[i] = -i;
+  set_new_limit(mem_threshold);
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  std::stable_sort(revv.begin(), revv.end());
   stop_counters(time, resource);
 
+  set_new_limit(~size_t(0));
   report_performance(__FILE__, "reverse", time, resource);
   clear_counters(time, resource);
 
-  for (int i = 0; i < max_size; ++i)
-    v[i] = i;
+  set_new_limit(mem_threshold);
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  std::stable_sort(fwdv.begin(), fwdv.end());
   stop_counters(time, resource);
 
+  set_new_limit(~size_t(0));
   report_performance(__FILE__, "forwards", time, resource);
   clear_counters(time, resource);
 
-  // a simple psuedo-random series which does not rely on rand() and friends
-  v[0] = 0;
+  start_counters(time, resource);
+  std::stable_sort(rndv.begin(), rndv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+  report_performance(__FILE__, "random", time, resource);
+}
+
+int main()
+{
+  using namespace __gnu_test;
+
+  // No memory constraint.
+  set_new_limit(~size_t(0));
+
+  std::vector<int> revv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    revv[i] = -i;
+
+  std::vector<int> fwdv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    fwdv[i] = i;
+
+  // a simple pseudo-random series which does not rely on rand() and friends
+  std::vector<int> rndv(max_size);
+  rndv[0] = 0;
   for (int i = 1; i < max_size; ++i)
-    v[i] = (v[i-1] + 110211473) * 745988807;
+    rndv[i] = (rndv[i-1] + 110211473) * 745988807;
+
+  time_counter time;
+  resource_counter resource;
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  bench(~size_t(0), revv, fwdv, rndv);
   stop_counters(time, resource);
 
-  report_performance(__FILE__, "random", time, resource);
+  report_performance(__FILE__, "bench 1 / 1 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Limit to fourth the expected size of the sorted array.
+  bench(max_size * sizeof(int) / 4, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1 / 4 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Limit to 1/64 of range size.
+  bench(max_size * sizeof(int) / 64, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1 /64 memory", time, resource);
+  clear_counters(time, resource);
+
+  revv.resize(small_size);
+  fwdv.resize(small_size);
+  rndv.resize(small_size);
+
+  start_counters(time, resource);
+  // Forbid any allocation.
+  bench(0, revv, fwdv, rndv);
+  stop_counters(time, resource);
 
+  report_performance(__FILE__, "bench 0 / 1 memory", time, resource);
   return 0;
 }

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
  2020-06-11  6:32 [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge François Dumont
@ 2020-06-22  6:27 ` François Dumont
  2020-06-24 17:39 ` Jonathan Wakely
  2020-11-19 12:26 ` Jonathan Wakely
  2 siblings, 0 replies; 6+ messages in thread
From: François Dumont @ 2020-06-22  6:27 UTC (permalink / raw)
  To: libstdc++, gcc-patches

Gentle remider.



On 11/06/20 8:32 am, François Dumont wrote:
> As we are on patching algos we still have this old one.
>
>     From the original patch I only kept the memory optimization part 
> as the new performance test was not showing good result for the other 
> part to change pivot value. I also kept the small change in 
> get_temporary_buffer even if I don't have strong feeling about it, it 
> just make sure that we'll try to allocate 1 element as a last chance 
> allocation.
>
>     Note that there is still place for an improvement. If we miss 
> memory on the heap we then use a recursive implementation which then 
> rely on stack memory. I would be surprise that a system which miss 
> heap memory would have no problem to allocate about the same on the 
> stack so we will surely end up in a stack overflow. I still have this 
> on my todo even if I already made several tries with no satisfying 
> result in terms of performance.
>
>     Tested under Linux x86_64.
>
> Commit message:
>
>     libstdc++: Limit memory allocation in stable_sort/inplace_merge 
> (PR 83938)
>
>     Reduce memory consumption in stable_sort/inplace_merge to what is 
> used.
>
>     Co-authored-by: François Dumont  <fdumont@gcc.gnu.org>
>
>     libstdc++-v3/ChangeLog:
>
>     2020-06-11  John Chang  <john.chang@samba.tv>
>                 François Dumont  <fdumont@gcc.gnu.org>
>
>             PR libstdc++/83938
>             * include/bits/stl_tempbuf.h (get_temporary_buffer): 
> Change __len
>             computation in the loop.
>             * include/bits/stl_algo.h:
>             (__inplace_merge): Take temporary buffer length from 
> smallest range.
>             (__stable_sort): Limit temporary buffer length.
>             * testsuite/25_algorithms/inplace_merge/1.cc (test03): 
> Test different
>             pivot positions.
>             * testsuite/performance/25_algorithms/stable_sort.cc: Test 
> stable_sort
>             under different heap memory conditions.
>             * testsuite/performance/25_algorithms/inplace_merge.cc: New.
>
> Ok to commit ?
>
> François
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
  2020-06-11  6:32 [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge François Dumont
  2020-06-22  6:27 ` François Dumont
@ 2020-06-24 17:39 ` Jonathan Wakely
  2020-06-24 20:12   ` François Dumont
  2020-11-18 20:43   ` François Dumont
  2020-11-19 12:26 ` Jonathan Wakely
  2 siblings, 2 replies; 6+ messages in thread
From: Jonathan Wakely @ 2020-06-24 17:39 UTC (permalink / raw)
  To: François Dumont; +Cc: libstdc++, gcc-patches

On 11/06/20 08:32 +0200, François Dumont via Libstdc++ wrote:
>As we are on patching algos we still have this old one.
>
>    From the original patch I only kept the memory optimization part 
>as the new performance test was not showing good result for the other 
>part to change pivot value. I also kept the small change in 
>get_temporary_buffer even if I don't have strong feeling about it, it 
>just make sure that we'll try to allocate 1 element as a last chance 
>allocation.
>
>    Note that there is still place for an improvement. If we miss 
>memory on the heap we then use a recursive implementation which then 
>rely on stack memory. I would be surprise that a system which miss 
>heap memory would have no problem to allocate about the same on the 
>stack so we will surely end up in a stack overflow. I still have this 
>on my todo even if I already made several tries with no satisfying 
>result in terms of performance.
>
>    Tested under Linux x86_64.
>
>Commit message:
>
>    libstdc++: Limit memory allocation in stable_sort/inplace_merge 
>(PR 83938)
>
>    Reduce memory consumption in stable_sort/inplace_merge to what is used.
>
>    Co-authored-by: François Dumont  <fdumont@gcc.gnu.org>
>
>    libstdc++-v3/ChangeLog:
>
>    2020-06-11  John Chang  <john.chang@samba.tv>
>                François Dumont  <fdumont@gcc.gnu.org>
>
>            PR libstdc++/83938
>            * include/bits/stl_tempbuf.h (get_temporary_buffer): 
>Change __len
>            computation in the loop.
>            * include/bits/stl_algo.h:
>            (__inplace_merge): Take temporary buffer length from 
>smallest range.
>            (__stable_sort): Limit temporary buffer length.
>            * testsuite/25_algorithms/inplace_merge/1.cc (test03): 
>Test different
>            pivot positions.
>            * testsuite/performance/25_algorithms/stable_sort.cc: Test 
>stable_sort
>            under different heap memory conditions.
>            * testsuite/performance/25_algorithms/inplace_merge.cc: New.
>
>Ok to commit ?

I'm very nervous about changes to sort algos that aren't absolutely
necessary for correctness. It needs careful review and lots of
testing. Please be patient.



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
  2020-06-24 17:39 ` Jonathan Wakely
@ 2020-06-24 20:12   ` François Dumont
  2020-11-18 20:43   ` François Dumont
  1 sibling, 0 replies; 6+ messages in thread
From: François Dumont @ 2020-06-24 20:12 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

On 24/06/20 7:39 pm, Jonathan Wakely wrote:
> On 11/06/20 08:32 +0200, François Dumont via Libstdc++ wrote:
>> As we are on patching algos we still have this old one.
>>
>>     From the original patch I only kept the memory optimization 
>> part as the new performance test was not showing good result for the 
>> other part to change pivot value. I also kept the small change in 
>> get_temporary_buffer even if I don't have strong feeling about it, it 
>> just make sure that we'll try to allocate 1 element as a last chance 
>> allocation.
>>
>>     Note that there is still place for an improvement. If we miss 
>> memory on the heap we then use a recursive implementation which then 
>> rely on stack memory. I would be surprise that a system which miss 
>> heap memory would have no problem to allocate about the same on the 
>> stack so we will surely end up in a stack overflow. I still have this 
>> on my todo even if I already made several tries with no satisfying 
>> result in terms of performance.
>>
>>     Tested under Linux x86_64.
>>
>> Commit message:
>>
>>     libstdc++: Limit memory allocation in 
>> stable_sort/inplace_merge (PR 83938)
>>
>>     Reduce memory consumption in stable_sort/inplace_merge to what 
>> is used.
>>
>>     Co-authored-by: François Dumont <fdumont@gcc.gnu.org>
>>
>>     libstdc++-v3/ChangeLog:
>>
>>     2020-06-11  John Chang  <john.chang@samba.tv>
>>                 François Dumont <fdumont@gcc.gnu.org>
>>
>>             PR libstdc++/83938
>>             * include/bits/stl_tempbuf.h 
>> (get_temporary_buffer): Change __len
>>             computation in the loop.
>>             * include/bits/stl_algo.h:
>>             (__inplace_merge): Take temporary buffer 
>> length from smallest range.
>>             (__stable_sort): Limit temporary buffer length.
>>             * testsuite/25_algorithms/inplace_merge/1.cc 
>> (test03): Test different
>>             pivot positions.
>>             * 
>> testsuite/performance/25_algorithms/stable_sort.cc: Test stable_sort
>>             under different heap memory conditions.
>>             * 
>> testsuite/performance/25_algorithms/inplace_merge.cc: New.
>>
>> Ok to commit ?
>
> I'm very nervous about changes to sort algos that aren't absolutely
> necessary for correctness. It needs careful review and lots of
> testing. Please be patient.
>
>
Sure, just note that there is no change to the algo logic in any way. It 
is only to limit the temporary buffer allocation to what is being used.

This kind of situation illustrates also why PR management would be nice. 
I wouldn't wonder if the patch hasn't been forgotten. I know that you 
are for it, I hope you'll make your point one day.

Thanks


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
  2020-06-24 17:39 ` Jonathan Wakely
  2020-06-24 20:12   ` François Dumont
@ 2020-11-18 20:43   ` François Dumont
  1 sibling, 0 replies; 6+ messages in thread
From: François Dumont @ 2020-11-18 20:43 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

Gentle reminder now that we are in stage 3 ?


On 24/06/20 7:39 pm, Jonathan Wakely wrote:
> On 11/06/20 08:32 +0200, François Dumont via Libstdc++ wrote:
>> As we are on patching algos we still have this old one.
>>
>>     From the original patch I only kept the memory optimization 
>> part as the new performance test was not showing good result for the 
>> other part to change pivot value. I also kept the small change in 
>> get_temporary_buffer even if I don't have strong feeling about it, it 
>> just make sure that we'll try to allocate 1 element as a last chance 
>> allocation.
>>
>>     Note that there is still place for an improvement. If we miss 
>> memory on the heap we then use a recursive implementation which then 
>> rely on stack memory. I would be surprise that a system which miss 
>> heap memory would have no problem to allocate about the same on the 
>> stack so we will surely end up in a stack overflow. I still have this 
>> on my todo even if I already made several tries with no satisfying 
>> result in terms of performance.
>>
>>     Tested under Linux x86_64.
>>
>> Commit message:
>>
>>     libstdc++: Limit memory allocation in 
>> stable_sort/inplace_merge (PR 83938)
>>
>>     Reduce memory consumption in stable_sort/inplace_merge to what 
>> is used.
>>
>>     Co-authored-by: François Dumont <fdumont@gcc.gnu.org>
>>
>>     libstdc++-v3/ChangeLog:
>>
>>     2020-06-11  John Chang  <john.chang@samba.tv>
>>                 François Dumont <fdumont@gcc.gnu.org>
>>
>>             PR libstdc++/83938
>>             * include/bits/stl_tempbuf.h 
>> (get_temporary_buffer): Change __len
>>             computation in the loop.
>>             * include/bits/stl_algo.h:
>>             (__inplace_merge): Take temporary buffer 
>> length from smallest range.
>>             (__stable_sort): Limit temporary buffer length.
>>             * testsuite/25_algorithms/inplace_merge/1.cc 
>> (test03): Test different
>>             pivot positions.
>>             * 
>> testsuite/performance/25_algorithms/stable_sort.cc: Test stable_sort
>>             under different heap memory conditions.
>>             * 
>> testsuite/performance/25_algorithms/inplace_merge.cc: New.
>>
>> Ok to commit ?
>
> I'm very nervous about changes to sort algos that aren't absolutely
> necessary for correctness. It needs careful review and lots of
> testing. Please be patient.
>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge
  2020-06-11  6:32 [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge François Dumont
  2020-06-22  6:27 ` François Dumont
  2020-06-24 17:39 ` Jonathan Wakely
@ 2020-11-19 12:26 ` Jonathan Wakely
  2 siblings, 0 replies; 6+ messages in thread
From: Jonathan Wakely @ 2020-11-19 12:26 UTC (permalink / raw)
  To: François Dumont; +Cc: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2789 bytes --]

On 11/06/20 08:32 +0200, François Dumont via Libstdc++ wrote:
>As we are on patching algos we still have this old one.
>
>    From the original patch I only kept the memory optimization part 
>as the new performance test was not showing good result for the other 
>part to change pivot value. I also kept the small change in 
>get_temporary_buffer even if I don't have strong feeling about it, it 
>just make sure that we'll try to allocate 1 element as a last chance 
>allocation.
>
>    Note that there is still place for an improvement. If we miss 
>memory on the heap we then use a recursive implementation which then 
>rely on stack memory. I would be surprise that a system which miss 
>heap memory would have no problem to allocate about the same on the 
>stack so we will surely end up in a stack overflow. I still have this 
>on my todo even if I already made several tries with no satisfying 
>result in terms of performance.
>
>    Tested under Linux x86_64.
>
>Commit message:
>
>    libstdc++: Limit memory allocation in stable_sort/inplace_merge 
>(PR 83938)
>
>    Reduce memory consumption in stable_sort/inplace_merge to what is used.
>
>    Co-authored-by: François Dumont  <fdumont@gcc.gnu.org>
>
>    libstdc++-v3/ChangeLog:
>
>    2020-06-11  John Chang  <john.chang@samba.tv>
>                François Dumont  <fdumont@gcc.gnu.org>
>
>            PR libstdc++/83938
>            * include/bits/stl_tempbuf.h (get_temporary_buffer): 
>Change __len
>            computation in the loop.
>            * include/bits/stl_algo.h:
>            (__inplace_merge): Take temporary buffer length from 
>smallest range.
>            (__stable_sort): Limit temporary buffer length.
>            * testsuite/25_algorithms/inplace_merge/1.cc (test03): 
>Test different
>            pivot positions.
>            * testsuite/performance/25_algorithms/stable_sort.cc: Test 
>stable_sort
>            under different heap memory conditions.
>            * testsuite/performance/25_algorithms/inplace_merge.cc: New.
>
>Ok to commit ?

Oops, I replied to the old thread about this patch. See comments at:
https://gcc.gnu.org/pipermail/gcc-patches/2020-November/559583.html

P.S. it looks to me like the "else" branch in __merge_adaptive will
rarely get used. In most cases, the temporary buffer is going to
succeed on the first allocation and have enough space.

So I think we should split out the "else" branch of __merge_adaptive
to a separate function, so that __merge_adaptive is a much smaller
function and the cold branch doesn't have to be loaded into cache.

Something like this, but I haven't benchmarked it.


[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 4157 bytes --]

diff --git a/libstdc++-v3/include/bits/stl_algo.h b/libstdc++-v3/include/bits/stl_algo.h
index 6efc99035b7d..428c22cf5d43 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -2400,6 +2400,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	return std::rotate(__first, __middle, __last);
     }
 
+  // This is a helper function for the cold branch of __merge_adaptive.
+  template<typename _BidirectionalIterator, typename _Distance,
+	   typename _Pointer, typename _Compare>
+    void
+    __merge_adaptive_resize(_BidirectionalIterator __first,
+			    _BidirectionalIterator __middle,
+			    _BidirectionalIterator __last,
+			    _Distance __len1, _Distance __len2,
+			    _Pointer __buffer, _Distance __buffer_size,
+			    _Compare __comp);
+
   /// This is a helper function for the merge routines.
   template<typename _BidirectionalIterator, typename _Distance, 
 	   typename _Pointer, typename _Compare>
@@ -2425,42 +2436,58 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	}
       else
 	{
-	  _BidirectionalIterator __first_cut = __first;
-	  _BidirectionalIterator __second_cut = __middle;
-	  _Distance __len11 = 0;
-	  _Distance __len22 = 0;
-	  if (__len1 > __len2)
-	    {
-	      __len11 = __len1 / 2;
-	      std::advance(__first_cut, __len11);
-	      __second_cut
-		= std::__lower_bound(__middle, __last, *__first_cut,
-				     __gnu_cxx::__ops::__iter_comp_val(__comp));
-	      __len22 = std::distance(__middle, __second_cut);
-	    }
-	  else
-	    {
-	      __len22 = __len2 / 2;
-	      std::advance(__second_cut, __len22);
-	      __first_cut
-		= std::__upper_bound(__first, __middle, *__second_cut,
-				     __gnu_cxx::__ops::__val_comp_iter(__comp));
-	      __len11 = std::distance(__first, __first_cut);
-	    }
-
-	  _BidirectionalIterator __new_middle
-	    = std::__rotate_adaptive(__first_cut, __middle, __second_cut,
-				     __len1 - __len11, __len22, __buffer,
-				     __buffer_size);
-	  std::__merge_adaptive(__first, __first_cut, __new_middle, __len11,
-				__len22, __buffer, __buffer_size, __comp);
-	  std::__merge_adaptive(__new_middle, __second_cut, __last,
-				__len1 - __len11,
-				__len2 - __len22, __buffer,
-				__buffer_size, __comp);
+	  std::__merge_adaptive_resize(__first, __middle, __last,
+				       __len1, __len2, __buffer, __buffer_size,
+				       __comp);
 	}
     }
 
+  // This is a helper function for the cold branch of __merge_adaptive.
+  template<typename _BidirectionalIterator, typename _Distance,
+	   typename _Pointer, typename _Compare>
+    void
+    __merge_adaptive_resize(_BidirectionalIterator __first,
+			    _BidirectionalIterator __middle,
+			    _BidirectionalIterator __last,
+			    _Distance __len1, _Distance __len2,
+			    _Pointer __buffer, _Distance __buffer_size,
+			    _Compare __comp)
+    {
+      _BidirectionalIterator __first_cut = __first;
+      _BidirectionalIterator __second_cut = __middle;
+      _Distance __len11 = 0;
+      _Distance __len22 = 0;
+      if (__len1 > __len2)
+	{
+	  __len11 = __len1 / 2;
+	  std::advance(__first_cut, __len11);
+	  __second_cut
+	    = std::__lower_bound(__middle, __last, *__first_cut,
+				 __gnu_cxx::__ops::__iter_comp_val(__comp));
+	  __len22 = std::distance(__middle, __second_cut);
+	}
+      else
+	{
+	  __len22 = __len2 / 2;
+	  std::advance(__second_cut, __len22);
+	  __first_cut
+	    = std::__upper_bound(__first, __middle, *__second_cut,
+				 __gnu_cxx::__ops::__val_comp_iter(__comp));
+	  __len11 = std::distance(__first, __first_cut);
+	}
+
+      _BidirectionalIterator __new_middle
+	= std::__rotate_adaptive(__first_cut, __middle, __second_cut,
+				 __len1 - __len11, __len22, __buffer,
+				 __buffer_size);
+      std::__merge_adaptive(__first, __first_cut, __new_middle, __len11,
+			    __len22, __buffer, __buffer_size, __comp);
+      std::__merge_adaptive(__new_middle, __second_cut, __last,
+			    __len1 - __len11,
+			    __len2 - __len22, __buffer,
+			    __buffer_size, __comp);
+    }
+
   /// This is a helper function for the merge routines.
   template<typename _BidirectionalIterator, typename _Distance,
 	   typename _Compare>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-11-19 12:26 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-11  6:32 [PATCH] PR 83938 Reduce memory consumption in stable_sort/inplace_merge François Dumont
2020-06-22  6:27 ` François Dumont
2020-06-24 17:39 ` Jonathan Wakely
2020-06-24 20:12   ` François Dumont
2020-11-18 20:43   ` François Dumont
2020-11-19 12:26 ` Jonathan Wakely

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).