On Mon, Sep 5, 2022, 23:51 Charles-Francois Natali <cf.natali@gmail.com>
wrote:

> `basic_filebuf::xsputn` would bypass the buffer when passed a chunk of
> size 1024 and above, seemingly as an optimisation.
>
> This can have a significant performance impact if the overhead of a
> `write` syscall is non-negligible, e.g. on a slow disk, on network
> filesystems, or simply during IO contention, because instead of flushing
> once per `BUFSIZ` chars (by default), we can end up flushing every 1024
> chars.
> The impact is even greater with custom larger buffers, e.g. for network
> filesystems, because the code could issue `write` for example 1000X more
> often than necessary with respect to the buffer size.
> It also introduces a significant discontinuity in performance when
> writing chunks of size 1024 and above.
>
> See this reproducer, which writes a fixed number of chunks to a file
> opened with `O_SYNC` - to replicate high-latency `write` - for varying
> chunk sizes:
>
> ```
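> $ cat test_fstream_flush.cpp
> #include <cassert>
> #include <ostream>
> #include <string>
> #include <vector>
>
> #include <ext/stdio_filebuf.h>
> #include <fcntl.h>
>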
> int
> main(int argc, char* argv[])
> {
>   assert(argc == 3);
>
>   const auto* path = argv[1];
>   const auto chunk_size = std::stoul(argv[2]);
>
>   const auto fd =
>     open(path, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC | O_CLOEXEC, 0666);
>   assert(fd >= 0);
>
>   auto filebuf = __gnu_cxx::stdio_filebuf<char>(fd, std::ios_base::out);
>   auto stream = std::ostream(&filebuf);
>
>   const auto chunk = std::vector<char>(chunk_size);
>
>   for (auto i = 0; i < 1'000; ++i) {
>     stream.write(chunk.data(), chunk.size());
>   }
>
>   return 0;
> }
> ```
>
> ```
> $ g++ -o /tmp/test_fstream_flush test_fstream_flush.cpp -std=c++17
> $ for i in $(seq 1021 1025); do echo -e "\n$i"; time /tmp/test_fstream_flush /tmp/foo $i; done
>
> 1021
>
> real    0m0.997s
> user    0m0.000s
> sys     0m0.038s
>
> 1022
>
> real    0m0.939s
> user    0m0.005s
> sys     0m0.032s
>
> 1023
>
> real    0m0.954s
> user    0m0.005s
> sys     0m0.034s
>
> 1024
>
> real    0m7.102s
> user    0m0.040s
> sys     0m0.192s
>
> 1025
>
> real    0m7.204s
> user    0m0.025s
> sys     0m0.209s
> ```
>
> See the huge drop in performance at the 1024-boundary.
>
> An `strace` confirms that from size 1024 onwards we effectively defeat
> buffering.
> 1023-sized writes:
> ```
> $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1023 2>&1 | head -n5
> openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC,
> 0666) = 3
> writev(3,
> [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=8184},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1023}], 2) = 9207
> writev(3,
> [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=8184},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1023}], 2) = 9207
> writev(3,
> [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=8184},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1023}], 2) = 9207
> writev(3,
> [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=8184},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1023}], 2) = 9207
> ```
>
> vs 1024-sized writes
> ```
> $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1024 2>&1 | head -n5
> openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC,
> 0666) = 3
> writev(3, [{iov_base=NULL, iov_len=0},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1024}], 2) = 1024
> writev(3, [{iov_base="", iov_len=0},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1024}], 2) = 1024
> writev(3, [{iov_base="", iov_len=0},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1024}], 2) = 1024
> writev(3, [{iov_base="", iov_len=0},
> {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> iov_len=1024}], 2) = 1024
> ```
>
> Instead, it makes sense to only bypass the buffer if the amount of data
> to be written is at least as large as the available buffer capacity.
>
> Closes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63746
> ---
>  libstdc++-v3/include/bits/fstream.tcc         |  9 +--
>  .../27_io/basic_filebuf/sputn/char/63746.cc   | 55 +++++++++++++++++++
>  2 files changed, 58 insertions(+), 6 deletions(-)
>  create mode 100644
> libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc
>
> diff --git a/libstdc++-v3/include/bits/fstream.tcc
> b/libstdc++-v3/include/bits/fstream.tcc
> index 7ccc887b8..2e9369628 100644
> --- a/libstdc++-v3/include/bits/fstream.tcc
> +++ b/libstdc++-v3/include/bits/fstream.tcc
> @@ -757,23 +757,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>      {
>        streamsize __ret = 0;
>        // Optimization in the always_noconv() case, to be generalized in
> the
> -      // future: when __n is sufficiently large we write directly instead
> of
> -      // using the buffer.
> +      // future: when __n is larger than the available capacity we write
> +      // directly instead of using the buffer.
>        const bool __testout = (_M_mode & ios_base::out
>                               || _M_mode & ios_base::app);
>        if (__check_facet(_M_codecvt).always_noconv()
>           && __testout && !_M_reading)
>         {
> -         // Measurement would reveal the best choice.
> -         const streamsize __chunk = 1ul << 10;
>           streamsize __bufavail = this->epptr() - this->pptr();
>
>           // Don't mistake 'uncommitted' mode buffered with unbuffered.
>           if (!_M_writing && _M_buf_size > 1)
>             __bufavail = _M_buf_size - 1;
>
> -         const streamsize __limit = std::min(__chunk, __bufavail);
> -         if (__n >= __limit)
> +         if (__n >= __bufavail)
>             {
>               const streamsize __buffill = this->pptr() - this->pbase();
>               const char* __buf = reinterpret_cast<const
> char*>(this->pbase());
> diff --git
> a/libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc
> b/libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc
> new file mode 100644
> index 000000000..36448e049
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc
> @@ -0,0 +1,55 @@
> +// Copyright (C) 2013-2022 Free Software Foundation, Inc.
> +//
> +// This file is part of the GNU ISO C++ Library.  This library is free
> +// software; you can redistribute it and/or modify it under the
> +// terms of the GNU General Public License as published by the
> +// Free Software Foundation; either version 3, or (at your option)
> +// any later version.
> +
> +// This library is distributed in the hope that it will be useful,
> +// but WITHOUT ANY WARRANTY; without even the implied warranty of
> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +// GNU General Public License for more details.
> +
> +// You should have received a copy of the GNU General Public License along
> +// with this library; see the file COPYING3.  If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +// { dg-require-fileio "" }
> +
> +#include <fstream>
> +#include <testsuite_hooks.h>
> +
> +class testbuf : public std::filebuf {
> +public:
> +  char_type* pub_pprt() const
> +  {
> +    return this->pptr();
> +  }
> +
> +  char_type* pub_pbase() const
> +  {
> +    return this->pbase();
> +  }
> +};
> +
> +void test01()
> +{
> +  using namespace std;
> +
> +  // Leave capacity to avoid flush.
> +  const streamsize chunk_size = BUFSIZ - 1 - 1;
> +  const char data[chunk_size] = {};
> +
> +  testbuf a_f;
> +  VERIFY( a_f.open("tmp_63746_sputn", ios_base::out) );
> +  VERIFY( chunk_size == a_f.sputn(data, chunk_size) );
> +  VERIFY( (a_f.pub_pprt() - a_f.pub_pbase()) == chunk_size );
> +  VERIFY( a_f.close() );
> +}
> +
> +int main()
> +{
> +  test01();
> +  return 0;
> +}
> --
> 2.30.2
>
>