public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Change \B to match previous regex implementation (as well as dfa.c and perl regex)
@ 2005-01-26 16:17 Jakub Jelinek
  2005-01-26 19:47 ` Ulrich Drepper
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2005-01-26 16:17 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

Hi!

As http://sources.redhat.com/bugzilla/show_bug.cgi?id=693
shows, old GNU regex, as well as dfa.c and perl regex
all implement \B as negation of \b (but still regex.texi
documents it as an empty string inside of word, rather than
empty string inside of word or surrounded by non-word characters).

2005-01-26  Jakub Jelinek  <jakub@redhat.com>

	[BZ #693]
	* posix/regex_internal.h (DUMMY_CONSTRAINT): Rename to...
	(WORD_DELIM_CONSTRAINT): ...this.
	(NOT_WORD_DELIM_CONSTRAINT): Define.
	(re_context_type): Add INSIDE_NOTWORD and NOT_WORD_DELIM,
	change WORD_DELIM to use WORD_DELIM_CONSTRAINT.
	* posix/regcomp.c (peek_token): For \B create NOT_WORD_DELIM
	anchor instead of INSIDE_WORD.
	(parse_expression): Handle NOT_WORD_DELIM constraint.
	* posix/bug-regex19.c (tests): Adjust tests that relied on \B
	being inside word instead of not word delim.
	* posix/tst-rxspencer.c (mb_frob_pattern): Don't frob escaped
	characters.
	* posix/rxspencer/tests: Add some new tests.

--- libc/posix/regex_internal.h.jj	2005-01-08 16:50:24.000000000 +0100
+++ libc/posix/regex_internal.h	2005-01-26 14:56:20.210528592 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -143,18 +143,21 @@ static inline void bitset_mask (bitset d
 #define NEXT_NEWLINE_CONSTRAINT 0x0020
 #define PREV_BEGBUF_CONSTRAINT 0x0040
 #define NEXT_ENDBUF_CONSTRAINT 0x0080
-#define DUMMY_CONSTRAINT 0x0100
+#define WORD_DELIM_CONSTRAINT 0x0100
+#define NOT_WORD_DELIM_CONSTRAINT 0x0200
 
 typedef enum
 {
   INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
   WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
   WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
+  INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
   LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
   LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
   BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
   BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
-  WORD_DELIM = DUMMY_CONSTRAINT
+  WORD_DELIM = WORD_DELIM_CONSTRAINT,
+  NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT
 } re_context_type;
 
 typedef struct
--- libc/posix/regcomp.c.jj	2005-01-19 14:13:00.000000000 +0100
+++ libc/posix/regcomp.c	2005-01-26 15:01:58.715941197 +0100
@@ -1859,7 +1859,7 @@ peek_token (token, input, syntax)
 	  if (!(syntax & RE_NO_GNU_OPS))
 	    {
 	      token->type = ANCHOR;
-	      token->opr.ctx_type = INSIDE_WORD;
+	      token->opr.ctx_type = NOT_WORD_DELIM;
 	    }
 	  break;
 	case 'w':
@@ -2349,15 +2349,25 @@ parse_expression (regexp, preg, token, s
       break;
     case ANCHOR:
       if ((token->opr.ctx_type
-	   & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
+	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
 	  && dfa->word_ops_used == 0)
 	init_word_char (dfa);
-      if (token->opr.ctx_type == WORD_DELIM)
+      if (token->opr.ctx_type == WORD_DELIM
+          || token->opr.ctx_type == NOT_WORD_DELIM)
 	{
 	  bin_tree_t *tree_first, *tree_last;
-	  token->opr.ctx_type = WORD_FIRST;
-	  tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
-	  token->opr.ctx_type = WORD_LAST;
+	  if (token->opr.ctx_type == WORD_DELIM)
+	    {
+	      token->opr.ctx_type = WORD_FIRST;
+	      tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
+	      token->opr.ctx_type = WORD_LAST;
+            }
+          else
+            {
+	      token->opr.ctx_type = INSIDE_WORD;
+	      tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
+	      token->opr.ctx_type = INSIDE_NOTWORD;
+            }
 	  tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
 	  token->type = OP_ALT;
 	  tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
--- libc/posix/bug-regex19.c.jj	2003-12-22 13:38:11.000000000 +0100
+++ libc/posix/bug-regex19.c	2005-01-26 15:10:51.334648249 +0100
@@ -1,5 +1,5 @@
 /* Regular expression tests.
-   Copyright (C) 2003 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
@@ -170,22 +170,22 @@ static struct test_s
   {ERE, "[^k]\\B[^k]", "kBk", 0, -1},
   {ERE, "[^C]\\B[^C]", "CCCABA", 0, 3},
   {ERE, "[^C]\\B[^C]", "CBC", 0, -1},
-  {ERE, ".(\\b|\\B).", "=~AB", 0, 1},
+  {ERE, ".(\\b|\\B).", "=~AB", 0, 0},
   {ERE, ".(\\b|\\B).", "A=C", 0, 0},
   {ERE, ".(\\b|\\B).", "ABC", 0, 0},
-  {ERE, ".(\\b|\\B).", "=~\\!", 0, -1},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 1},
+  {ERE, ".(\\b|\\B).", "=~\\!", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 0},
   {ERE, "[^k](\\b|\\B)[^k]", "A=C", 0, 0},
   {ERE, "[^k](\\b|\\B)[^k]", "ABC", 0, 0},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 3},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, -1},
-  {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, -1},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 1},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 0},
   {ERE, "[^C](\\b|\\B)[^C]", "A=C", 0, 0},
   {ERE, "[^C](\\b|\\B)[^C]", "ABC", 0, 0},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 3},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, -1},
-  {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, -1},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, 0},
   {ERE, "\\b([A]|[!]|.B)", "A=AC", 0, 0},
   {ERE, "\\b([A]|[!]|.B)", "=AC", 0, 1},
   {ERE, "\\b([A]|[!]|.B)", "!AC", 0, 1},
--- libc/posix/tst-rxspencer.c.jj	2003-12-29 15:01:28.000000000 +0100
+++ libc/posix/tst-rxspencer.c	2005-01-26 17:01:31.234511142 +0100
@@ -1,5 +1,5 @@
 /* Regular expression tests.
-   Copyright (C) 2003 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
@@ -127,14 +127,15 @@ mb_frob_string (const char *str, const c
 }
 
 /* Like mb_frob_string, but don't replace anything between
-   [: and :], [. and .] or [= and =].  */
+   [: and :], [. and .] or [= and =] or characters escaped
+   with a backslash.  */
 
 static char *
 mb_frob_pattern (const char *str, const char *letters)
 {
   char *ret, *dst;
   const char *src;
-  int in_class = 0;
+  int in_class = 0, escaped = 0;
 
   if (str == NULL)
     return NULL;
@@ -144,7 +145,18 @@ mb_frob_pattern (const char *str, const 
     return NULL;
 
   for (src = str, dst = ret; *src; ++src)
-    if (!in_class && strchr (letters, *src))
+    if (*src == '\\')
+      {
+	escaped ^= 1;
+	*dst++ = *src;
+      }
+    else if (escaped)
+      {
+	escaped = 0;
+	*dst++ = *src;
+	continue;
+      }
+    else if (!in_class && strchr (letters, *src))
       dst = mb_replace (dst, *src);
     else
       {
--- libc/posix/rxspencer/tests.jj	2004-11-19 13:38:46.000000000 +0100
+++ libc/posix/rxspencer/tests	2005-01-26 16:35:32.596730739 +0100
@@ -526,3 +526,12 @@ a((b+|((c)*)))+d	-	abcd	abcd	c,c,c,c
 (((\b))){0}	-	x	@x	-,-,-
 a(((.*)))b((\2)){0}c	-	abc	abc	@bc,@bc,@bc,-,-
 a(((.*)))b((\1)){0}c	-	axbc	axbc	x,x,x,-,-
+
+\b	&	SaT	@aT
+\b	&	aT	@aT
+a.*\b	&	abT	ab
+\b	&	STSS
+\B	&	abc	@bc
+\B	&	aSbTc
+\B	&	SaT	@SaT
+\B	&	aSTSb	@TSb

	Jakub

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] Change \B to match previous regex implementation (as well as dfa.c and perl regex)
  2005-01-26 16:17 [PATCH] Change \B to match previous regex implementation (as well as dfa.c and perl regex) Jakub Jelinek
@ 2005-01-26 19:47 ` Ulrich Drepper
  0 siblings, 0 replies; 2+ messages in thread
From: Ulrich Drepper @ 2005-01-26 19:47 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

[-- Attachment #1: Type: text/plain, Size: 110 bytes --]

Applied.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 252 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2005-01-26 19:47 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-26 16:17 [PATCH] Change \B to match previous regex implementation (as well as dfa.c and perl regex) Jakub Jelinek
2005-01-26 19:47 ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).