From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <SRS0=29An=HF=kam.mff.cuni.cz=hubicka@sourceware.org>
Received: from nikam.ms.mff.cuni.cz (nikam.ms.mff.cuni.cz [195.113.20.16])
	by sourceware.org (Postfix) with ESMTPS id D97F83858C39;
	Fri, 24 Nov 2023 20:07:37 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org D97F83858C39
Authentication-Results: sourceware.org; dmarc=fail (p=none dis=none) header.from=ucw.cz
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=kam.mff.cuni.cz
ARC-Filter: OpenARC Filter v1.0.0 sourceware.org D97F83858C39
Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=195.113.20.16
ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1700856461; cv=none;
	b=XZazniY36L4WCy9hDbkHRUbuSUpNHsVSX1lr3CFB0+gddqdXM4dicgy/IuzS/KAMLIuNr/E2+ZhkAYYhHBcAIR6YuO+y5pEIgReuGAH83g/jld+LNGjLKjGpz/BxaRwk3a6DY11GiEmNP8utMYo7XduauZNmlMGuhqNgsUct0qg=
ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key;
	t=1700856461; c=relaxed/simple;
	bh=Qpy7/VVUwtSA4h0K/NqjlMQNXjfxIMUTywzkH4gG5qY=;
	h=DKIM-Signature:Date:From:To:Subject:Message-ID:MIME-Version; b=b5rUoRG3T1TmMKLy9/DB74urdmaJkZqgn5phZuJ04zuukTI8TZaZQkhqk0Bomz4X+t7ekx6e2IdYigY0DRhZe2UdG7JlPFxCbgcCRbNrrnpTXcxNbMLquP4UDlRaRLlzFzOAyGMV/QNdlSF0m8G+2rLazCBzK+P1E8vtCmG5F9c=
ARC-Authentication-Results: i=1; server2.sourceware.org
Received: by nikam.ms.mff.cuni.cz (Postfix, from userid 16202)
	id C3CA528BA9D; Fri, 24 Nov 2023 21:07:35 +0100 (CET)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=ucw.cz; s=gen1;
	t=1700856455;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:cc:mime-version:mime-version:content-type:content-type:
	 in-reply-to:in-reply-to:references:references;
	bh=xNYzZX+4GR21vLHxN4ddP/sSjXFW22qy03cCKGiw6p8=;
	b=Re3W2oRsOzTgm97qj0spSVMyU8/eYc7AFcQnX5s4EfjnoG/YogLDMWxzqnwIIPC5jcMgYU
	TLrGvrKPys4YuEmVuflAXFMHWyr+Zmd1rPFm8zhuJRYD/0MtFxuNQnKs9AewbTh0DDcFgc
	rORF72h+5IPlZ22zEwXuAAn4n6Ne+RA=
Date: Fri, 24 Nov 2023 21:07:35 +0100
From: Jan Hubicka <hubicka@ucw.cz>
To: Matthias Kretz <m.kretz@gsi.de>
Cc: jwakely@redhat.com, libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org
Subject: Re: libstdc++: Speed up push_back
Message-ID: <ZWECh8DQFpocwj+0@kam.mff.cuni.cz>
References: <ZVqD4QG/X2nj6GrP@kam.mff.cuni.cz>
 <11345207.nUPlyArG6x@minbar>
 <ZV9qqZqt6BpBjrlN@kam.mff.cuni.cz>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <ZV9qqZqt6BpBjrlN@kam.mff.cuni.cz>
X-Spam-Status: No, score=-11.0 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,GIT_PATCH_0,HEADER_FROM_DIFFERENT_DOMAINS,JMQ_SPF_NEUTRAL,RCVD_IN_MSPIKE_H3,RCVD_IN_MSPIKE_WL,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <gcc-patches.gcc.gnu.org>

> With my changes at -O3 we now inline push_back, so we could optimize the
> first loop to the second. However with 
> ~/trunk-install/bin/gcc -O3  auto.C  -S -fdump-tree-all-details -fno-exceptions -fno-store-merging -fno-tree-slp-vectorize
> the fist problem is right at the begining:
> 
>   <bb 2> [local count: 97603128]:
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_start = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_finish = 0B;
>   MEM[(struct _Vector_impl_data *)x_4(D)]._M_end_of_storage = 0B;
>   _37 = operator new (40);
>   _22 = x_4(D)->D.26019._M_impl.D.25320._M_finish;
>   _23 = x_4(D)->D.26019._M_impl.D.25320._M_start;

Thinking of this problem, it is easy to adjust reserve to copy _M_start
and _M_finish to local variables across the call of new() which makes
the old values visible to compiler regardless of points-to-analysis.
In fact _M_realloc_insert already has such code:

	  // Make local copies of these members because the compiler thinks
	  // the allocator can alter them if 'this' is globally reachable.
	  pointer __old_start = this->_M_impl._M_start;
	  pointer __old_finish = this->_M_impl._M_finish;

So attached patch does that in reserve.  The downside is that if things
are not inlined we may end up pushing extra copy to stack, but I believe
the benefit from inlining actually pays this back.

The testcase with loop still does not optimize it so I simplified it:

#include <vector>

auto
f()
{
  std::vector<int> x;
  x.reserve(10);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  x.push_back(0);
  return x;
}

auto                                                                                                                                                                                                                                                                   
g()                                                                                                                                                                                                                                                                    
{ return std::vector<int>(10, 0); }                                                                                                                                                                                                                                    

This now compiles to less code but it is somewhat funny:

  <bb 2> [local count: 1073741824]:
  MEM <vector(2) long unsigned int> [(int * *)x_3(D)] = { 0, 0 };
  MEM[(struct _Vector_impl_data *)x_3(D)]._M_end_of_storage = 0B;
  _70 = operator new (40);

  <bb 3> [local count: 1073741824]:
  x_3(D)->D.26024._M_impl.D.25325._M_start = _70;
  _65 = _70 + 40;
  x_3(D)->D.26024._M_impl.D.25325._M_end_of_storage = _65;
  _74 = _70 + 4;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _74;
  MEM <unsigned long> [(int *)_70] = 0;
  _80 = _70 + 8;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _80;
  *_80 = 0;
  _86 = _70 + 12;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _86;
  *_86 = 0;
  _92 = _70 + 16;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _92;
  *_92 = 0;
  _98 = _70 + 20;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _98;
  *_98 = 0;
  _104 = _70 + 24;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _104;
  *_104 = 0;
  _110 = _70 + 28;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _110;
  *_110 = 0;
  _116 = _70 + 32;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _116;
  *_116 = 0;
  _122 = _70 + 36;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _122;
  return x_3(D);

The setup code in BB2 is useless and due to fake escape, as discussed in
PRR112653.

However it is funny that we miss dead store elimintation and repeately
set x_3(D)->D.26024._M_impl.D.25325._M_finish = _104;

The problem here is that until pushback.C.208t.forwprop4:std::vector we
do not optimize the reallocation code:

  <bb 2> [local count: 1073741824]:
  MEM <vector(2) long unsigned int> [(int * *)x_3(D)] = { 0, 0 };
  MEM[(struct _Vector_impl_data *)x_3(D)]._M_end_of_storage = 0B;
  _70 = operator new (40);

  <bb 3> [local count: 1073741824]:
  x_3(D)->D.26024._M_impl.D.25325._M_start = _70;
  _65 = _70 + 40;
  x_3(D)->D.26024._M_impl.D.25325._M_end_of_storage = _65;
  *_70 = 0;
  _74 = _70 + 4;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _74;
  D.26115 = 0;
  if (_65 != _74)
    goto <bb 4>; [82.57%]
  else
    goto <bb 5>; [17.43%]

  <bb 4> [local count: 886588624]:
  *_74 = 0;
  _80 = _70 + 8;
  x_3(D)->D.26024._M_impl.D.25325._M_finish = _80;
  goto <bb 8>; [100.00%]

  <bb 5> [local count: 187153200]:
  std::vector<int>::_M_realloc_append<int> (x_3(D), &D.26115);
  goto <bb 7>; [100.00%]

  <bb 6> [count: 0]:
<L12>:
  goto <bb 44>; [100.00%]

  <bb 7> [local count: 187153200]:
  pretmp_127 = MEM[(int * const &)x_3(D) + 8];
  pretmp_144 = MEM[(struct _Vector_base *)x_3(D)]._M_impl.D.25325._M_end_of_storage;

And after forwprop4 it is too late because DSE is no longer run.
I filled PR112706. For some reason FRE is missing to optimize:

int *ptr;
void link_error ();
void
test ()
{

        int *ptr1 = ptr + 10;
        int *ptr2 = ptr + 20;
        if (ptr1 == ptr2)
                link_error ();
}


The vector.tcc change was regtested on x86_64-linux, OK?

libstdc++-v3/ChangeLog:

	* include/bits/vector.tcc (reserve): Copy _M_start and _M_finish
	to local variables to allow propagation across call to
	allocator.
diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc
index 0ccef7911b3..0a9db29c1c7 100644
--- a/libstdc++-v3/include/bits/vector.tcc
+++ b/libstdc++-v3/include/bits/vector.tcc
@@ -72,27 +72,30 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       if (this->capacity() < __n)
 	{
 	  const size_type __old_size = size();
+	  // Make local copies of these members because the compiler thinks
+	  // the allocator can alter them if 'this' is globally reachable.
+	  pointer __old_start = this->_M_impl._M_start;
+	  pointer __old_finish = this->_M_impl._M_finish;
 	  pointer __tmp;
 #if __cplusplus >= 201103L
 	  if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
 	    {
 	      __tmp = this->_M_allocate(__n);
-	      _S_relocate(this->_M_impl._M_start, this->_M_impl._M_finish,
+	      _S_relocate(__old_start, __old_finish,
 			  __tmp, _M_get_Tp_allocator());
 	    }
 	  else
 #endif
 	    {
 	      __tmp = _M_allocate_and_copy(__n,
-		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
-		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
-	      std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
+		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_start),
+		_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(__old_finish));
+	      std::_Destroy(__old_start, __old_finish,
 			    _M_get_Tp_allocator());
 	    }
 	  _GLIBCXX_ASAN_ANNOTATE_REINIT;
-	  _M_deallocate(this->_M_impl._M_start,
-			this->_M_impl._M_end_of_storage
-			- this->_M_impl._M_start);
+	  _M_deallocate(__old_start,
+			this->_M_impl._M_end_of_storage - __old_finish);
 	  this->_M_impl._M_start = __tmp;
 	  this->_M_impl._M_finish = __tmp + __old_size;
 	  this->_M_impl._M_end_of_storage = this->_M_impl._M_start + __n;