public inbox for cygwin@cygwin.com
 help / color / mirror / Atom feed
* Trouble with character sets
@ 2020-08-03 15:36 Michael Shay
  2020-08-03 16:31 ` Brian Inglis
  0 siblings, 1 reply; 11+ messages in thread
From: Michael Shay @ 2020-08-03 15:36 UTC (permalink / raw)
  To: cygwin

[-- Attachment #1: Type: text/plain, Size: 3160 bytes --]

I'm having a problem with Cygwin 3.1.4, changing the character set on the 
fly. It seems to work with Cygwin applications, but not with Win32 
applications.

I have a Korn shell script:
#!/bin/ksh
# Switches the locale to en_US.CP1252, then passes the string 'ZÇ' to a
# Win32 program (cygtest.exe) and/or to Cygwin's /bin/echo through a child
# ksh, so the two results can be compared.

# Remember the caller's settings so they can be put back before exiting.
OLD_LANG="$LANG"
OLD_LC_ALL="$LC_ALL"

echo "locale on entry"
locale
echo ""

export LANG="en_US.CP1252"
export LC_ALL=en_US.CP1252

echo "locale changed to"
locale
echo ""

# The NUMBER of arguments selects what is run (their values are ignored):
#   none        -> the Win32 program 'cygtest.exe' (the default)
#   one         -> Cygwin's '/bin/echo'
#   two         -> the Win32 program, then '/bin/echo'
#   three or more -> nothing

case $# in
   0 )  echo "Running WIN32 pgm"
        ksh -c 'cygtest.exe ZÇ'
        ;;
   1 )  echo "Running Cygwin 'echo'"
        ksh -c '/bin/echo ZÇ'
        ;;
   2 )  echo "Running WIN32 pgm"
        ksh -c 'cygtest.exe ZÇ'
        echo ""
        echo "Running Cygwin 'echo'"
        ksh -c '/bin/echo ZÇ'
        ;;
   * ) ;;
esac

LC_ALL="$OLD_LC_ALL"
LANG="$OLD_LANG"

and a Win32 application (attached file cygtest.cpp)

I used gdb to see what was happening in child_info_spawn::worker(), when a 
Win32 program is started using:

          rc = CreateProcessW (runpath,   /* image name w/ full path */
                   cmd.wcs (wcmd),  /* what was passed to exec */
                   sa,    /* process security attrs */
                   sa,    /* thread security attrs */
                   TRUE,    /* inherit handles */
                   c_flags,
                   envblock,  /* environment */
                   NULL,
                   &si,
                   &pi);
Specifically, 'cmd.wcs(wcmd)' invokes:

  /* Fill WBUF from `buf' (declared outside this excerpt) and return WBUF.
     When N is exactly 1 only a terminating L'\0' is stored; for every
     other N the work is handed to sys_mbstowcs with N as the length
     argument.  */
  wchar_t *wcs (wchar_t *wbuf, size_t n)
  {
    if (n != 1)
      {
        sys_mbstowcs (wbuf, n, buf);
        return wbuf;
      }
    wbuf[0] = L'\0';
    return wbuf;
  }

and sys_mbstowcs():

/* Convert SRC into wide characters in DST by calling sys_cp_mbstowcs with
   the conversion routine currently named by __MBTOWC.  When that routine
   is __ascii_mbtowc, __utf8_mbtowc is passed in its place.  DLEN and NMS
   are forwarded unchanged.  */
size_t __reg3
sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
{
  mbtowc_p f_mbtowc = __MBTOWC;
  if (f_mbtowc == __ascii_mbtowc)
    {
      /* Reporter's note (observed in gdb): <<<<< this is ALWAYS done,
         no matter what charset is in use.  */
      f_mbtowc = __utf8_mbtowc;
    }
  return sys_cp_mbstowcs (f_mbtowc, dst, dlen, src, nms);
}

Since CP1252 is an 8-bit, single-byte character set with characters >= 
0x80, the '0xc7' character is always translated as '0xc7 0xf0', with the 
'0xf0' byte indicating an invalid character in the string.

This doesn't seem to happen when e.g. '/bin/echo' is run, although I 
haven't stepped into the code to see what's happening.

I do not think this is a Cygwin bug, but since the User's Guide says the 
locale and charset can be changed on the fly, I don't know what's going 
awry.

Any suggestions? If you need more information, I'm happy to provide it.

Mike Shay

Here's the source for the Win32 program. I built it with Visual Studio 
2015, to get something running quickly.



  
NOTICE  from Ab Initio: This email (including any attachments) may contain information that is subject to confidentiality obligations or is legally privileged, and sender does not waive confidentiality or privilege. If received in error, please notify the sender, delete this email, and make no further use, disclosure, or distribution.  

[-- Attachment #2: cygtest.cpp --]
[-- Type: application/octet-stream, Size: 4428 bytes --]

// cygtest.cpp : Defines the entry point for the console application.
//


#include <SDKDDKVer.h>
#include <stdio.h>
#include <windows.h>
#include <string>
#include <vector>
using namespace std;

// Forward declaration; the definition appears later in this file.
// Converts the nChars wide characters at lpw into the nBytes-byte buffer lpa
// using the given code page, and returns lpa (or NULL on failure).
LPSTR __stdcall UnicodeToMByteHelper(LPSTR lpa, int nBytes, LPCWSTR lpw, int nChars, int codepage);

// Maps the charset part of a Cygwin locale string (the text after the '.',
// for example "UTF-8" or "CP1252") to a Windows code page identifier.
//
// Names are matched without regard to ASCII letter case:
//   utf-8, utf8                              -> 65001
//   ansi, iso-8859-1                         -> 1252
//   oem                                      -> 437
//   cpNNN, windows-NNN, windows_NNN, windowsNNN -> NNN
//
// Returns 0 when the name is empty, is not recognized, or has a recognized
// prefix that is not followed by a usable decimal number.  Characters after
// the digits are ignored.
static UINT cyg_codepage_string_to_CP(const string &cp)
{
  if (cp.empty())
    return 0;

  // Fold ASCII letters to lower case once so that each name needs a single
  // comparison and mixed-case spellings such as "Utf-8" are accepted too.
  string name(cp);
  for (string::size_type i = 0; i < name.length(); ++i) {
    if (name[i] >= 'A' && name[i] <= 'Z')
      name[i] = static_cast<char>(name[i] - 'A' + 'a');
  }

  if (name == "utf-8" || name == "utf8")
    return 65001;
  if (name == "ansi" || name == "iso-8859-1")
    return 1252;
  // oem is also standard cygwin nomenclature
  if (name == "oem")
    return 437;

  // cpXXX, windows-XXX and windows_XXX are all recognized by
  // the Ab Initio extensions to cygwin.  Not sure if they are
  // known to standard cygwin, but I don't think they are.
  string::size_type pos = 0;
  if (name.compare(0, 2, "cp") == 0) {
    pos = 2;
  }
  else if (name.compare(0, 7, "windows") == 0) {
    pos = 7;
    // The separator is optional.  Skipping it only when present avoids both
    // the out_of_range throw for a bare "windows" and losing the first digit
    // of "windows1252".
    if (pos < name.length() && (name[pos] == '-' || name[pos] == '_'))
      ++pos;
  }
  else {
    return 0;
  }

  // Accumulate the run of decimal digits by hand: no sign, no whitespace,
  // and an explicit overflow check instead of atoi's undefined behaviour.
  const UINT limit = static_cast<UINT>(-1);
  UINT value = 0;
  for (; pos < name.length() && name[pos] >= '0' && name[pos] <= '9'; ++pos) {
    const UINT digit = static_cast<UINT>(name[pos] - '0');
    if (value > (limit - digit) / 10)
      return 0;   // number does not fit in a UINT, so it is not a code page
    value = value * 10 + digit;
  }
  return value;
}

// Determines the Windows code page that corresponds to the locale found in
// the environment.
//
// The locale is read from LC_ALL, then LC_CTYPE, then LANG; the first one
// that is set to a non-empty value wins.  The value is expected to look like
// <language ID>.<charset>[@modifier]; the charset is translated by
// cyg_codepage_string_to_CP().
//
// Returns 65001 (UTF-8, matching Cygwin's default "C.UTF-8") when no locale
// variable is set, when the value has no '.', or when the charset is not
// recognized.
static UINT get_cygwin_codepage()
{
  const UINT default_cp = 65001;

  // Highest precedence first.  An empty value is treated like an unset one.
  static const char *const locale_vars[] = { "LC_ALL", "LC_CTYPE", "LANG" };
  const unsigned int var_count = sizeof locale_vars / sizeof locale_vars[0];

  const char *locale_value = NULL;
  for (unsigned int i = 0; i < var_count; ++i) {
    const char *value = ::getenv(locale_vars[i]);
    if (NULL != value && '\0' != value[0]) {
      locale_value = value;
      break;
    }
  }
  if (NULL == locale_value)
    return default_cp;

  // The character set string, if specified, starts AFTER the '.'.
  const string cyg_locale(locale_value);
  const string::size_type dotPos = cyg_locale.find('.');
  if (string::npos == dotPos)
    return default_cp;

  // Drop an optional "@modifier" suffix so that only the charset name is
  // handed to the translator.
  string page = cyg_locale.substr(dotPos + 1);
  const string::size_type atPos = page.find('@');
  if (string::npos != atPos)
    page.erase(atPos);

  // 0 means "not recognized".  UINT is unsigned, so the test has to be
  // against zero rather than ">= 0", which is always true.
  const UINT shell_cp = cyg_codepage_string_to_CP(page);
  return (0 != shell_cp) ? shell_cp : default_cp;
}


// Converts nChars wide characters starting at lpw to a NUL-terminated
// multibyte string in lpa, using the given Windows code page.
//
//   lpa      output buffer; must not be NULL
//   nBytes   size of lpa in bytes, including room for the terminating NUL
//   lpw      input wide characters; also printed with %S in the error
//            message, so it is expected to be NUL-terminated
//   nChars   number of wide characters to convert; 0 yields an empty string
//   codepage code page passed to WideCharToMultiByte
//
// Returns lpa on success and NULL on failure; failures are reported on
// stdout.  Every call except the first also dumps the input characters
// (in wmain the first call is the one for wargv[0]).
LPSTR __stdcall UnicodeToMByteHelper(LPSTR lpa, int nBytes, LPCWSTR lpw, int nChars, int codepage)
{
  static int printInfo = 0;
  int nOut = 0;

  if (NULL == lpa) {
    printf("NULL input string\n");
    return NULL;
  }
  // Need at least one byte for the terminator, and something to read from
  // whenever characters are requested.
  if (nBytes < 1 || (nChars > 0 && NULL == lpw)) {
    printf("Invalid conversion arguments\n");
    return NULL;
  }

  if (printInfo) {
    printf("Transcoding using Cygwin codepage: %d\nInput widechar string:\n", codepage);
    for (int i = 0; i < nChars; i++)
      printf("\tlpw[%d] = %C - %02X\n", i, lpw[i], lpw[i]);
  }
  printInfo = 1;   // only the very first call stays silent

  if (nChars > 0) {
    // One byte is held back for the NUL written below.  A remaining size of
    // 0 must not reach WideCharToMultiByte: there it means "report the
    // required size" and nothing would be written.
    if (nBytes < 2) {
      printf("Output buffer too small\n");
      return NULL;
    }
    nOut = WideCharToMultiByte(codepage, 0, lpw, nChars, lpa, nBytes - 1, NULL, NULL);
    if (0 == nOut) {
      DWORD dwErr = GetLastError();
      // DWORD is an unsigned long, hence %lu.
      printf("WideCharToMultiByte(%d, %S) failed, error %lu\n", codepage, lpw, dwErr);
      return NULL;
    }
  }
  lpa[nOut] = '\0';   // nOut <= nBytes - 1, so this stays inside the buffer
  return lpa;
}

int wmain(int argc, wchar_t** wargv)
{
  try {
    char *pNull = "NULL";
    char** argv = new char*[(argc)+1];
    int _argi;
    int codepage = get_cygwin_codepage();
    for (_argi = 0; _argi < (argc); _argi++) {
      if (wargv[_argi]) {
        LPWSTR utf_lpw  = wargv[_argi];
        int utf_len     = lstrlenW(utf_lpw);
        int utf_convert = utf_len * 3 + 1;
        LPSTR utf_lpa   = (LPSTR)_alloca(utf_convert);
        argv[_argi]     = UnicodeToMByteHelper(utf_lpa, utf_convert, utf_lpw, utf_len, codepage);
      }
      else {
        argv[_argi] = pNull;
      }
    }
    argv[(argc)] = NULL;

    // Now print the transcoded string.

    for (int i = 1; i < argc; i++)
      printf("%s: %s\n", __FUNCTION__, argv[i]);

    return 0;
  }
  catch (...) {
    printf("Caught unhandled exception\n");
  }
}

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-08-05  5:22 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-03 15:36 Trouble with character sets Michael Shay
2020-08-03 16:31 ` Brian Inglis
2020-08-03 17:10   ` Michael Shay
2020-08-03 17:42     ` Andrey Repin
2020-08-03 18:15       ` Michael Shay
2020-08-03 21:23       ` Trouble with output character sets from Win32 applications running under mintty Brian Inglis
2020-08-03 22:05         ` Michael Shay
2020-08-04 12:32           ` Trouble with output character sets from Win32 applications running under mksh Brian Inglis
2020-08-04 21:19             ` Michael Shay
2020-08-05  2:10               ` Thomas Wolff
2020-08-05  5:22               ` Brian Inglis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).