public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Many new word boundary tests
@ 2003-11-21  0:24 Jakub Jelinek
  2003-11-21 11:46 ` Ulrich Drepper
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2003-11-21  0:24 UTC (permalink / raw)
  To: Ulrich Drepper, Roland McGrath; +Cc: Glibc hackers

Hi!

I wrote some new word boundary tests (though there are really many things
which I haven't even touched yet, like \>, \w, \W) plus infrastructure
so that UTF-8 tests are generated from the ASCII ones.
None of the ASCII tests fail, but many UTF-8 tests fail: more without
my patch from today and slightly fewer with the patch applied.

So that the glibc testsuite passes, perhaps
ret |= do_mb_tests (&tests[i]);
can be replaced with
do_mb_tests (&tests[i]);
until this is fixed.

If somebody has good ideas for new \</\>/\b/\B/\w/\W tests, feel free to add
some.

2003-11-21  Jakub Jelinek  <jakub@redhat.com>

	* posix/bug-regex19.c (BRE, ERE): Define.
	(tests): Add many new tests, remove UTF-8 ones.
	(do_one_test, do_mb_tests): New functions.
	(main): Rewritten using do_one_test and do_mb_tests.

--- libc/posix/bug-regex19.c.jj	2003-11-20 21:24:03.000000000 +0100
+++ libc/posix/bug-regex19.c	2003-11-21 01:02:28.000000000 +0100
@@ -26,87 +26,274 @@
 #include <string.h>
 #include <locale.h>
 
-static struct
+#define BRE RE_SYNTAX_POSIX_BASIC
+#define ERE RE_SYNTAX_POSIX_EXTENDED
+
+static struct test_s
 {
   int syntax;
   const char *pattern;
   const char *string;
   int start, res;
 } tests[] = {
-  /* \xc3\x84		LATIN CAPITAL LETTER A WITH DIAERESIS
-     \xc3\x96		LATIN CAPITAL LETTER O WITH DIAERESIS
-     \xe2\x80\x94	EM DASH  */
-  /* Should not match.  */
-  {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 0, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 2, -1},
-  {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAAO", 1, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 0, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1},
-  {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
-#if 0
-  /* XXX these 2 tests still fail.  */
-  {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
-  {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
-#endif
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
-  /* Should match.  */
-  {RE_SYNTAX_POSIX_BASIC, "\\<A", "AA", 0, 0},
-  {RE_SYNTAX_POSIX_BASIC, "\\<A", "a-AA", 2, 2},
-  {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAA-", 1, 2},
-  {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAA", 1, 2},
-  {RE_SYNTAX_POSIX_BASIC, "\\bA", "AA", 0, 0},
-  {RE_SYNTAX_POSIX_BASIC, "\\bA", "a-AA", 2, 2},
-  {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA-", 1, 2},
-  {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA", 1, 2},
-  {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "\xc3\x84\xc3\x84", 0, 0},
-  {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84", 1, 3},
-  {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "\xc3\x84\xc3\x84", 0, 0},
-  {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
-  {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3}
+  {BRE, "\\<A", "CBAA", 0, -1},
+  {BRE, "\\<A", "CBAA", 2, -1},
+  {BRE, "A\\>", "CAAB", 1, -1},
+  {BRE, "\\bA", "CBAA", 0, -1},
+  {BRE, "\\bA", "CBAA", 2, -1},
+  {BRE, "A\\b", "CAAB", 1, -1},
+  {BRE, "\\<A", "AA", 0, 0},
+  {BRE, "\\<A", "C-AA", 2, 2},
+  {BRE, "A\\>", "CAA-", 1, 2},
+  {BRE, "A\\>", "CAA", 1, 2},
+  {BRE, "\\bA", "AA", 0, 0},
+  {BRE, "\\bA", "C-AA", 2, 2},
+  {BRE, "A\\b", "CAA-", 1, 2},
+  {BRE, "A\\b", "CAA", 1, 2},
+  {ERE, "\\b(A|!|.B)", "A=AC", 0, 0},
+  {ERE, "\\b(A|!|.B)", "=AC", 0, 1},
+  {ERE, "\\b(A|!|.B)", "!AC", 0, 1},
+  {ERE, "\\b(A|!|.B)", "=AB", 0, 1},
+  {ERE, "\\b(A|!|.B)", "DA!C", 0, 2},
+  {ERE, "\\b(A|!|.B)", "=CB", 0, 1},
+  {ERE, "\\b(A|!|.B)", "!CB", 0, 1},
+  {ERE, "\\b(A|!|.B)", "D,B", 0, 1},
+  {ERE, "\\b(A|!|.B)", "!.C", 0, -1},
+  {ERE, "\\b(A|!|.B)", "BCB", 0, -1},
+  {ERE, "(A|\\b)(A|B|C)", "DAAD", 0, 1},
+  {ERE, "(A|\\b)(A|B|C)", "DABD", 0, 1},
+  {ERE, "(A|\\b)(A|B|C)", "AD", 0, 0},
+  {ERE, "(A|\\b)(A|B|C)", "C!", 0, 0},
+  {ERE, "(A|\\b)(A|B|C)", "D,B", 0, 2},
+  {ERE, "(A|\\b)(A|B|C)", "DA?A", 0, 3},
+  {ERE, "(A|\\b)(A|B|C)", "BBC", 0, 0},
+  {ERE, "(A|\\b)(A|B|C)", "DA", 0, -1},
+  {ERE, "(!|\\b)(!|=|~)", "A!=\\", 0, 1},
+  {ERE, "(!|\\b)(!|=|~)", "/!=A", 0, 1},
+  {ERE, "(!|\\b)(!|=|~)", "A=A", 0, 1},
+  {ERE, "(!|\\b)(!|=|~)", "==!=", 0, 2},
+  {ERE, "(!|\\b)(!|=|~)", "==C~", 0, 3},
+  {ERE, "(!|\\b)(!|=|~)", "=~=", 0, -1},
+  {ERE, "(!|\\b)(!|=|~)", "~!", 0, -1},
+  {ERE, "(!|\\b)(!|=|~)", "~=~", 0, -1},
+  {ERE, "(\\b|A.)[ABC]", "AC", 0, 0},
+  {ERE, "(\\b|A.)[ABC]", "=A", 0, 1},
+  {ERE, "(\\b|A.)[ABC]", "DACC", 0, 1},
+  {ERE, "(\\b|A.)[A~C]", "AC", 0, 0},
+  {ERE, "(\\b|A.)[A~C]", "=A", 0, 1},
+  {ERE, "(\\b|A.)[A~C]", "DACC", 0, 1},
+  {ERE, "(\\b|A.)[A~C]", "B!A=", 0, 2},
+  {ERE, "(\\b|A.)[A~C]", "B~C", 0, 1},
+  {ERE, ".\\b.", "AA~", 0, 1},
+  {ERE, ".\\b.", "=A=", 0, 0},
+  {ERE, ".\\b.", "==", 0, -1},
+  {ERE, ".\\b.", "ABA", 0, -1},
+  {ERE, "\\<(A|!|.B)", "A=AC", 0, 0},
+  {ERE, "\\<(A|!|.B)", "=AC", 0, 1},
+  {ERE, "\\<(A|!|.B)", "!AC", 0, 1},
+  {ERE, "\\<(A|!|.B)", "=AB", 0, 1},
+  {ERE, "\\<(A|!|.B)", "=CB", 0, 1},
+  {ERE, "\\<(A|!|.B)", "!CB", 0, 1},
+  {ERE, "\\<(A|!|.B)", "DA!C", 0, -1},
+  {ERE, "\\<(A|!|.B)", "D,B", 0, -1},
+  {ERE, "\\<(A|!|.B)", "!.C", 0, -1},
+  {ERE, "\\<(A|!|.B)", "BCB", 0, -1},
+  {ERE, "(A|\\<)(A|B|C)", "DAAD", 0, 1},
+  {ERE, "(A|\\<)(A|B|C)", "DABD", 0, 1},
+  {ERE, "(A|\\<)(A|B|C)", "AD", 0, 0},
+  {ERE, "(A|\\<)(A|B|C)", "C!", 0, 0},
+  {ERE, "(A|\\<)(A|B|C)", "D,B", 0, 2},
+  {ERE, "(A|\\<)(A|B|C)", "DA?A", 0, 3},
+  {ERE, "(A|\\<)(A|B|C)", "BBC", 0, 0},
+  {ERE, "(A|\\<)(A|B|C)", "DA", 0, -1},
+  {ERE, "(!|\\<)(!|=|~)", "A!=\\", 0, 1},
+  {ERE, "(!|\\<)(!|=|~)", "/!=A", 0, 1},
+  {ERE, "(!|\\<)(!|=|~)", "==!=", 0, 2},
+  {ERE, "(!|\\<)(!|=|~)", "==C~", 0, -1},
+  {ERE, "(!|\\<)(!|=|~)", "A=A", 0, -1},
+  {ERE, "(!|\\<)(!|=|~)", "=~=", 0, -1},
+  {ERE, "(!|\\<)(!|=|~)", "~!", 0, -1},
+  {ERE, "(!|\\<)(!|=|~)", "~=~", 0, -1},
+  {ERE, "(\\<|A.)[ABC]", "AC", 0, 0},
+  {ERE, "(\\<|A.)[ABC]", "=A", 0, 1},
+  {ERE, "(\\<|A.)[ABC]", "DACC", 0, 1},
+  {ERE, "(\\<|A.)[A~C]", "AC", 0, 0},
+  {ERE, "(\\<|A.)[A~C]", "=A", 0, 1},
+  {ERE, "(\\<|A.)[A~C]", "DACC", 0, 1},
+  {ERE, "(\\<|A.)[A~C]", "B!A=", 0, 2},
+  {ERE, "(\\<|A.)[A~C]", "B~C", 0, 2},
+  {ERE, ".\\<.", "=A=", 0, 0},
+  {ERE, ".\\<.", "AA~", 0, -1},
+  {ERE, ".\\<.", "==", 0, -1},
+  {ERE, ".\\<.", "ABA", 0, -1},
+  {ERE, ".\\B.", "ABA", 0, 0},
+  {ERE, ".\\B.", "=BDC", 0, 1},
+  {ERE, ".(\\b|\\B).", "=~AB", 0, 1},
+  {ERE, ".(\\b|\\B).", "A=C", 0, 0},
+  {ERE, ".(\\b|\\B).", "ABC", 0, 0},
+  {ERE, ".(\\b|\\B).", "=~\\!", 0, -1},
 };
 
 int
-main (void)
+do_one_test (const struct test_s *test, const char *fail)
 {
-  struct re_pattern_buffer regbuf;
+  int res;
   const char *err;
+  struct re_pattern_buffer regbuf;
+
+  re_set_syntax (test->syntax);
+  memset (&regbuf, '\0', sizeof (regbuf));
+  err = re_compile_pattern (test->pattern, strlen (test->pattern),
+			    &regbuf);
+  if (err != NULL)
+    {
+      printf ("%sre_compile_pattern \"%s\" failed: %s\n", fail, test->pattern,
+	      err);
+      return 1;
+    }
+
+  res = re_search (&regbuf, test->string, strlen (test->string),
+		   test->start, strlen (test->string) - test->start, NULL);
+  if (res != test->res)
+    {
+      printf ("%sre_search \"%s\" \"%s\" failed: %d (expected %d)\n",
+	      fail, test->pattern, test->string, res, test->res);
+      regfree (&regbuf);
+      return 1;
+    }
+
+  if (test->res > 0 && test->start == 0)
+    {
+      res = re_search (&regbuf, test->string, strlen (test->string),
+		       test->res, strlen (test->string) - test->res, NULL);
+      if (res != test->res)
+	{
+	  printf ("%sre_search from expected \"%s\" \"%s\" failed: %d (expected %d)\n",
+		  fail, test->pattern, test->string, res, test->res);
+	  regfree (&regbuf);
+	  return 1;
+	}
+    }
+
+  regfree (&regbuf);
+  return 0;  
+}
+
+static inline char *
+replace (char *p, char c)
+{
+  switch (c)
+    {
+      /* A -> A" */
+      case 'A': *p++ = '\xc3'; *p++ = '\x84'; break;
+      /* B -> O" */
+      case 'B': *p++ = '\xc3'; *p++ = '\x96'; break;
+      /* C -> U" */
+      case 'C': *p++ = '\xc3'; *p++ = '\x9c'; break;
+      /* D -> a" */
+      case 'D': *p++ = '\xc3'; *p++ = '\xa4'; break;
+      /* ! -> MULTIPLICATION SIGN */
+      case '!': *p++ = '\xc3'; *p++ = '\x97'; break;
+      /* = -> EM DASH */
+      case '=': *p++ = '\xe2'; *p++ = '\x80'; *p++ = '\x94'; break;
+      /* ~ -> MUSICAL SYMBOL HALF NOTE */
+      case '~': *p++ = '\xf0'; *p++ = '\x9d'; *p++ = '\x85'; *p++ = '\x9e';
+      break;
+    }
+  return p;
+}
+
+int
+do_mb_tests (const struct test_s *test)
+{
+  int i, j;
+  struct test_s t;
+  const char *const chars = "ABCD!=~";
+  char repl[8], *p;
+  char pattern[strlen (test->pattern) * 4 + 1];
+  char string[strlen (test->string) * 4 + 1];
+  char fail[8 + sizeof ("UTF-8 ")];
+
+  t.pattern = pattern;
+  t.string = string;
+  strcpy (fail, "UTF-8 ");
+  for (i = 1; i < 128; ++i)
+    {
+      p = repl;
+      for (j = 0; j < 7; ++j)
+	if (i & (1 << j))
+	  {
+	    if (!strchr (test->pattern, chars[j])
+		&& !strchr (test->string, chars[j]))
+	      break;
+	    *p++ = chars[j];
+	  }
+      if (j < 7)
+	continue;
+      *p = '\0';
+
+      for (j = 0, p = pattern; test->pattern[j]; ++j)
+	if (strchr (repl, test->pattern[j]))
+	  p = replace (p, test->pattern[j]);
+	else if (test->pattern[j] == '\\' && test->pattern[j + 1])
+	  {
+	    *p++ = test->pattern[j++];
+	    *p++ = test->pattern[j];
+	  }
+	else
+	  *p++ = test->pattern[j];
+      *p = '\0';
+
+      t.start = test->start;
+      t.res = test->res;
+
+      for (j = 0, p = string; test->string[j]; ++j)
+	if (strchr (repl, test->string[j]))
+	  {
+	    char *d = replace (p, test->string[j]);
+	    if (test->start > j)
+	      t.start += d - p - 1;
+	    if (test->res > j)
+	      t.res += d - p - 1;
+	    p = d;
+	  }
+	else
+	  *p++ = test->string[j];
+      *p = '\0';
+
+      p = stpcpy (fail + strlen ("UTF-8 "), repl);
+      *p++ = ' ';
+      *p = '\0';
+
+      if (do_one_test (&t, fail))
+	return 1;
+    }
+  return 0;
+}
+
+int
+main (void)
+{
   size_t i;
   int ret = 0;
 
   mtrace ();
 
-  setlocale (LC_ALL, "de_DE.UTF-8");
   for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
     {
-      int res;
-      re_set_syntax (tests[i].syntax);
-      memset (&regbuf, '\0', sizeof (regbuf));
-      err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
-                                &regbuf);
-      if (err != NULL)
+      if (setlocale (LC_ALL, "de_DE.ISO-8859-1") == NULL)
 	{
-	  printf ("re_compile_pattern failed: %s\n", err);
+	  puts ("setlocale de_DE.ISO-8859-1 failed");
 	  ret = 1;
-	  continue;
 	}
-
-      res = re_search (&regbuf, tests[i].string, strlen (tests[i].string),
-		       tests[i].start,
-		       strlen (tests[i].string) - tests[i].start, NULL);
-      if (res != tests[i].res)
+      ret |= do_one_test (&tests[i], "");
+      if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
 	{
-	  printf ("re_search %zd failed: %d\n", i, res);
+	  puts ("setlocale de_DE.UTF-8 failed");
 	  ret = 1;
-	  regfree (&regbuf);
-	  continue;
 	}
-      regfree (&regbuf);
+      ret |= do_one_test (&tests[i], "UTF-8 ");
+      ret |= do_mb_tests (&tests[i]);
     }
 
   return ret;

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Many new word boundary tests
  2003-11-21  0:24 [PATCH] Many new word boundary tests Jakub Jelinek
@ 2003-11-21 11:46 ` Ulrich Drepper
  0 siblings, 0 replies; 2+ messages in thread
From: Ulrich Drepper @ 2003-11-21 11:46 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Applied.  I did prevent the MB tests from causing a make check failure.

Thanks,

- -- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)

iD8DBQE/vc872ijCOnn/RHQRAv45AJ4r2jbBJfFJFX9O9v3eJqF5vYHp+gCfVm8j
8rGduICpR7oympyY9QUm/Kg=
=YF20
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2003-11-21  8:39 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-11-21  0:24 [PATCH] Many new word boundary tests Jakub Jelinek
2003-11-21 11:46 ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).