public inbox for gcc-prs@sourceware.org
help / color / mirror / Atom feed
* libstdc++/9028: codecvt<wchar_t, char, mbstate_t> doesn't work on Red Hat Linux 8.0.
@ 2002-12-20  7:46 peturr02
  0 siblings, 0 replies; only message in thread
From: peturr02 @ 2002-12-20  7:46 UTC (permalink / raw)
  To: gcc-gnats


>Number:         9028
>Category:       libstdc++
>Synopsis:       codecvt<wchar_t, char, mbstate_t> doesn't work on Red Hat Linux 8.0.
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    unassigned
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Fri Dec 20 07:46:01 PST 2002
>Closed-Date:
>Last-Modified:
>Originator:     peturr02@ru.is
>Release:        gcc-3.2.1
>Organization:
>Environment:
Red Hat Linux 8.0 on i686; glibc-2.2.93
>Description:
codecvt<wchar_t, char, mbstate_t> doesn't handle variable-width character sets (such as UTF-8) and thus fails on Red Hat Linux 8.0 (which uses UTF-8 as the default charset).

It also requires that input buffers be zero-terminated (which doesn't hold for I/O buffers).

basic_filebuf<wchar_t> also assumes that each wchar_t will be converted to exactly 1 char.

The net result is that wofstream, wifstream, wfstream, wcout, wcin and wcerr are essentially useless on such a system.
>How-To-Repeat:
The attached file codecvtbug.cc contains an example program that attempts to convert several wide character strings to narrow characters and back. The conversion is first attempted with iconv to check if it is possible.

If the conversion is possible, the program then tries to write the string to a wofstream and read it back with a wifstream.

Finally, the program attempts to convert the string directly with codecvt<wchar_t, char, mbstate_t>.

The whole test is run with several strings (all subsets of ISO-8859-1) and with several locales.
>Fix:

>Release-Note:
>Audit-Trail:
>Unformatted:
----gnatsweb-attachment----
Content-Type: text/plain; name="codecvtbug.cc"
Content-Disposition: inline; filename="codecvtbug.cc"

#include <iconv.h>
#include <langinfo.h>
#include <unistd.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <sstream>
#include <stdexcept>
#include <string>
using namespace std;

// Number of elements in every scratch buffer (char and wchar_t alike) that
// the tests below declare.
const std::size_t buffer_size = 2048;

// Exception thrown by VERIFY when a checked condition does not hold.
// what() yields "<file>:<line>: <func>: Assertion '<expr>' failed.".
class AssertFailed : public std::runtime_error
{
private:
  // Builds the diagnostic text from the failure site (file, line, func) and
  // the text of the failed expression (error).
  static std::string make_string(char const* file, int line, char const* func,
                                 char const* error)
  {
    std::ostringstream stream;
    // A freshly constructed stream adopts the global locale, which main()
    // replaces with each locale under test.  Pin the classic locale so the
    // line number is always printed as plain digits, never with locale
    // specific digit grouping.
    stream.imbue(std::locale::classic());
    stream << file << ':' << line << ": " << func
           << ": Assertion \'" << error << "\' failed.";
    return stream.str();
  }

public:
  // file/line/func identify where the assertion failed; error is the text of
  // the expression that evaluated to false.
  AssertFailed(char const* file, int line, char const* func,
               char const* error)
    : std::runtime_error(make_string(file, line, func, error))
  {
  }
};

// Test assertion: evaluates x and, if it is false, throws AssertFailed
// carrying the current file, line, enclosing function and the text of x.
// The do { } while (false) wrapper makes an invocation act as one statement,
// so it can be used unbraced as the body of an if.
#define VERIFY(x) \
do { \
if (!(x)) \
  throw AssertFailed(__FILE__, __LINE__, __PRETTY_FUNCTION__, #x); \
} while (false)

// Baseline check that does not involve codecvt: converts str from wchar_t to
// the codeset reported by nl_langinfo(CODESET) with iconv, then converts the
// result back to wchar_t.
//
// Returns false if either iconv() call reports an error, true if the round
// trip reproduces str.  A round trip that completes but yields different text
// (or failure to open a conversion descriptor) trips a VERIFY.  When this
// returns true, failures in the other tests are bugs in codecvt.
bool test_iconv(std::wstring const& str)
{
  size_t const iconv_error = static_cast<size_t>(-1);
  std::string const charset(nl_langinfo(CODESET));

  // --- wide -> narrow ---
  char cbuf[buffer_size];
  std::memset(cbuf, 'X', sizeof(cbuf));

  wchar_t const* pcw = str.data();
  size_t inbytesleft = str.size() * sizeof(wchar_t);
  char* pc = cbuf;
  size_t outbytesleft = sizeof(cbuf);

  iconv_t cd = iconv_open(charset.c_str(), "WCHAR_T");
  VERIFY(cd != (iconv_t)-1);

  // iconv takes its input as char**, so the wide source pointer has to be
  // reinterpreted (and stripped of const) for the call.
  size_t res = iconv(cd,
                     reinterpret_cast<char**>(const_cast<wchar_t**>(&pcw)),
                     &inbytesleft,
                     &pc, &outbytesleft);
  iconv_close(cd);
  if (res == iconv_error)
    return false;

#ifdef DEBUG_PRINT
  std::copy(cbuf, pc, std::ostream_iterator<char>(std::cout));
  std::cout << std::endl;
#endif

  // --- narrow -> wide ---
  // pc currently marks the end of the narrow text; remember its length and
  // rewind pc to the start of the buffer for the reverse conversion.
  wchar_t wbuf[buffer_size];
  wchar_t* pw = wbuf;
  inbytesleft = pc - cbuf;
  pc = cbuf;
  outbytesleft = sizeof(wbuf);

  cd = iconv_open("WCHAR_T", charset.c_str());
  VERIFY(cd != (iconv_t)-1);

  res = iconv(cd,
              &pc, &inbytesleft,
              reinterpret_cast<char**>(&pw), &outbytesleft);
  iconv_close(cd);
  if (res == iconv_error)
    return false;

  // The round trip must give back exactly the characters we started with.
  VERIFY(static_cast<size_t>(pw - wbuf) == str.size());
  VERIFY(equal(str.begin(), str.end(), wbuf));

  return true;
}

// Round-trips str through a temporary file: writes it with a wofstream,
// reads it back with a wifstream and VERIFYs that the text read equals str.
// This exercises codecvt indirectly, through basic_filebuf.
//
// When setbuf is true, pubsetbuf(0, 0) is called on each stream's buffer
// before the file is opened.  The value of setbuf is echoed to cout.
void test_wfstream(std::wstring const& str, bool setbuf)
{
  char name[] = "codecvtbugXXXXXX";
  // mkstemp picks the name and creates the file in one step, so there is no
  // window between choosing a name and creating the file (the weakness of
  // mktemp).  Only the name is needed here, so the descriptor is closed
  // right away; the streams below reopen the file by name.
  int const fd = mkstemp(name);
  VERIFY(fd != -1);
  ::close(fd);

  std::wstring tmp;

  {
    std::wofstream out;
    if (setbuf)
      out.rdbuf()->pubsetbuf(0, 0);
    out.open(name);
    std::copy(str.begin(), str.end(),
              std::ostreambuf_iterator<wchar_t>(out));
  } // out is flushed and closed here, before the file is read back
  {
    std::wifstream in;
    if (setbuf)
      in.rdbuf()->pubsetbuf(0, 0);
    in.open(name);
    std::copy(std::istreambuf_iterator<wchar_t>(in),
              std::istreambuf_iterator<wchar_t>(),
              std::back_inserter(tmp));
  }

  // Delete the file before any VERIFY can throw, so a failing test does not
  // leave it behind.
  std::remove(name);

  std::cout << std::boolalpha << setbuf << std::endl;

  // Check that the string that was written is returned correctly
  VERIFY(tmp.size() == str.size());
  VERIFY(tmp == str);
}

// Converts the whole of str to narrow characters with codecvt::out() and back
// with codecvt::in(), using the codecvt<wchar_t, char, mbstate_t> facet of
// the current global locale.
//
// fail == false: both conversions must succeed, the round trip must reproduce
// str, and length() / encoding() must agree with the number of bytes written.
// fail == true: at least one of the two conversions must report error.
// Neither conversion may ever report partial.
void test_codecvt(std::wstring const& str, bool fail)
{
  typedef std::codecvt<wchar_t, char, std::mbstate_t> codecvt_type;
  codecvt_type const& cvt = std::use_facet<codecvt_type>(std::locale());

  std::mbstate_t state;
  std::memset(&state, 0, sizeof(state));

  char cbuf[buffer_size];
  std::memset(cbuf, 'X', sizeof(cbuf));

  // wide -> narrow, whole string in one call
  wchar_t const* cwend;
  char* cend;
  std::codecvt_base::result const res1 =
    cvt.out(state,
            str.data(), str.data() + str.size(), cwend,
            cbuf, cbuf + sizeof(cbuf), cend);

  VERIFY(res1 != codecvt_base::partial);

  if (!fail)
    {
      // The buffer should be big enough and the characters should be
      // convertible, so the conversion should succeed.
      VERIFY(res1 != codecvt_base::error);

      // ok or noconv was returned, so the entire string should have been
      // consumed.
      VERIFY(cwend == str.data() + str.size());

#ifdef DEBUG_PRINT
      std::copy(cbuf, cend, std::ostream_iterator<char>(std::cout));
      std::cout << std::endl;
#endif
    }

  // narrow -> wide, starting again from a zeroed state
  std::memset(&state, 0, sizeof(state));
  wchar_t wbuf[buffer_size];
  char const* ccend;
  wchar_t* wend;
  std::codecvt_base::result const res2 =
    cvt.in(state,
           cbuf, cend, ccend,
           wbuf, wbuf + buffer_size, wend);

  VERIFY(res2 != codecvt_base::partial);

  if (fail)
    {
      VERIFY(res1 == codecvt_base::error || res2 == codecvt_base::error);
      return;
    }

  VERIFY(res2 != codecvt_base::error);

  // Check that in() did not read past the end of the buffer
  VERIFY(ccend <= cend);

  // Check that the same characters were returned as were originally passed
  // in.
  VERIFY(static_cast<size_t>(wend - wbuf) == str.size());
  VERIFY(equal(str.begin(), str.end(), wbuf));

  // length() for str.size() characters must cover the whole narrow text.
  std::memset(&state, 0, sizeof(state));
  int length = cvt.length(state, cbuf, cend, str.size());
  VERIFY(length == cend - cbuf);

  // A positive encoding() is a fixed number of bytes per character, so the
  // byte count must be exactly that multiple of the character count.
  int enc = cvt.encoding();
  if (enc > 0)
    VERIFY(cend - cbuf == enc * str.size());
}

// Incremental variant of test_codecvt: feeds out() at most one wide
// character per call while the end of the output window advances one byte
// per call, then feeds in() an input window whose end advances one byte per
// call.  No call may report error, out() must never write past its window,
// and the characters finally decoded must equal str.
void test_codecvt2(std::wstring const& str)
{
  typedef std::codecvt<wchar_t, char, std::mbstate_t> codecvt_type;
  codecvt_type const& cvt = std::use_facet<codecvt_type>(std::locale());

  std::mbstate_t state;
  std::memset(&state, 0, sizeof(state));

  char cbuf[buffer_size];
  std::memset(cbuf, 'X', sizeof(cbuf));
  char* const cbuf_limit = cbuf + sizeof(cbuf);

  wchar_t const* const str_end = str.data() + str.size();

  // cwend / cend track how far the input has been consumed and the output
  // has been written; each call resumes from where the previous one stopped.
  wchar_t const* cwend = str.data();
  char* cend = cbuf;
  std::codecvt_base::result res;

  for (char* i = cbuf + 1; i < cbuf_limit; ++i)
    {
      wchar_t const* const chunk_end = std::min(cwend + 1, str_end);
      res = cvt.out(state, cwend, chunk_end, cwend,
                    cend, i, cend);

      // XXX is this correct? Should out() return partial or error when the
      // output buffer is too small to hold 1 character?
      VERIFY(res != codecvt_base::error);
      VERIFY(cend <= i);

      // Check that out didn't overrun the output buffer.
      VERIFY(*cend == 'X');

      bool const input_consumed = (cwend == str_end);
      if (input_consumed && res == std::codecvt_base::ok)
        break;
    }

  // This conversion succeeded when the whole input buffer was passed in,
  // it should also succeed when the input and output buffers are passed
  // in one character at a time.
  VERIFY(res == codecvt_base::ok);

#ifdef DEBUG_PRINT
  std::copy(cbuf, cend, std::ostream_iterator<char>(std::cout));
  std::cout << std::endl;
#endif

  // Decode again from a zeroed state, widening the input window by one byte
  // per call.
  std::memset(&state, 0, sizeof(state));
  wchar_t wbuf[buffer_size];
  wchar_t* const wbuf_limit = wbuf + buffer_size;
  char const* ccend = cbuf;
  wchar_t* wend = wbuf;

  for (char* i = cbuf + 1; i <= cend; ++i)
    {
      res = cvt.in(state, ccend, i, ccend,
                   wend, wbuf_limit, wend);

      VERIFY(res != codecvt_base::error);
      VERIFY(ccend <= i);
      VERIFY(wend <= wbuf + str.size());
    }

  VERIFY(res == codecvt_base::ok);
  VERIFY(ccend <= cend);

  // Check that the conversion is reversible
  VERIFY(static_cast<size_t>(wend - wbuf) == str.size());
  VERIFY(equal(str.begin(), str.end(), wbuf));
}

// Per-character checks.  For every character of str:
//  - out() with a full-size buffer must return ok and leave the byte after
//    its output untouched;
//  - length() for one character must equal the number of bytes out() wrote;
//  - unless out() wrote exactly one byte, in() given only the first byte and
//    out() given room for only one byte must both return partial.
void test_codecvt3(std::wstring const& str)
{
  typedef std::codecvt<wchar_t, char, std::mbstate_t> codecvt_type;
  codecvt_type const& cvt = std::use_facet<codecvt_type>(std::locale());

  wchar_t const* const last = str.data() + str.length();
  for (wchar_t const* wc = str.data(); wc != last; ++wc)
    {
      char cbuf[buffer_size];
      std::memset(cbuf, 'X', sizeof(cbuf));

      std::mbstate_t state;
      std::memset(&state, 0, sizeof(state));

      wchar_t const* wide_next;
      char* pc;
      std::codecvt_base::result res =
        cvt.out(state,
                wc, wc + 1, wide_next,
                cbuf, cbuf + sizeof(cbuf), pc);

      // Conversion of a single character should succeed.
      VERIFY(res == codecvt_base::ok);
      VERIFY(*pc == 'X');

      std::memset(&state, 0, sizeof(state));
      int length = cvt.length(state, cbuf, pc, 1);
      VERIFY(length == pc - cbuf);

      // The truncation checks below do not apply when the character was
      // written as exactly one byte.
      if (pc == cbuf + 1)
        continue;

      // Only the first byte is offered to in().
      std::memset(&state, 0, sizeof(state));
      wchar_t wbuf[buffer_size];
      char const* narrow_next;
      wchar_t* wide_out;
      res = cvt.in(state,
                   cbuf, cbuf + 1, narrow_next,
                   wbuf, wbuf + buffer_size, wide_out);
      VERIFY(res == codecvt_base::partial);

      // Only one byte of room is offered to out().
      std::memset(&state, 0, sizeof(state));
      std::memset(cbuf, 'X', sizeof(cbuf));
      res = cvt.out(state,
                    wc, wc + 1, wide_next,
                    cbuf, cbuf + 1, pc);
      VERIFY(res == codecvt_base::partial);
    }
}

// Runs the test suite for one string.  test_iconv decides what is expected:
// if iconv cannot round-trip str, codecvt is expected to fail as well;
// otherwise every codecvt test must pass.
void test(std::wstring const& str)
{
  if (!test_iconv(str))
    {
      test_codecvt(str, true);
      return;
    }

  // Test codecvt indirectly through basic_filebuf
  test_wfstream(str, false);
  test_wfstream(str, true);

  // Test codecvt directly
  test_codecvt(str, false);
  test_codecvt2(str);
  test_codecvt3(str);
}

// Runs every test string under each locale in locnames.  A locale that
// cannot be constructed is reported on cout and skipped.  An AssertFailed
// from any test is reported on cerr together with the locale name.
// Returns 1 if any locale failed, 0 otherwise.
int main()
{
  char const* const locnames[] = {
    "is_IS.UTF-8", "is_IS", "is_IS.iso885915", "is_IS.iso88591",
    "de_DE.UTF-8", "en_US.UTF-8", "en_US.iso885915",
    "en_US.iso88591" };

  std::locale loc;
  int ret = 0;

  for (char const* const* locname = locnames;
       locname != locnames + sizeof(locnames) / sizeof(locnames[0]);
       ++locname)
    {
      try
        {
          loc = std::locale(*locname);
        }
      catch (std::exception const&)
        {
          std::cout << "locale " << *locname << " not supported"
                    << std::endl;
          continue;
        }

      // The tests pick up their codecvt facet from the global locale.
      std::locale::global(loc);

      try
        {
          // Lowercase letters of icelandic alphabet in UCS-4, as converted
          // by iconv.
          // XXX endianness
          wchar_t const alpha[] = {
            97, 225, 98, 100, 240, 101, 233, 102, 103, 104, 105, 237, 106,
            107, 108, 109, 110, 111, 243, 112, 114, 115, 116, 117, 250, 118,
            120, 121, 253, 254, 230, 246
          };

          std::wstring str(alpha,
                           alpha + sizeof(alpha) / sizeof(alpha[0]));
          test(str);
          std::transform(str.begin(), str.end(), str.begin(),
                         std::towupper);
          test(str);

          // Whitespace and embedded NULs.  sizeof(spec) is used on purpose:
          // the terminating NUL of the literal is part of the test data.
          char const spec[] = "\n\0\t \t\n\0\0\n \n";

          // Widen each char through wcout.  A plain loop is used instead of
          // bind1st/mem_fun, which were removed in C++17.
          std::wstring str2;
          for (char const* p = spec; p != spec + sizeof(spec); ++p)
            str2 += std::wcout.widen(*p);
          test(str2);

          std::wstring str3;
          for (int i = 0; i < 256; ++i) // Entire iso8859-1 codeset
            str3 += static_cast<wchar_t>(i);
          test(str3);
        }
      catch (AssertFailed const& e)
        {
          std::cerr << "locale " << *locname << " failed:\n";
          std::cerr << e.what() << std::endl;
          ret = 1;
        }
    }

  return ret;
}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2002-12-20 15:46 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-12-20  7:46 libstdc++/9028: codecvt<wchar_t, char, mbstate_t> doesn't work on Red Hat Linux 8.0 peturr02

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).