public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH v9 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL
@ 2023-07-03 17:52 наб
  2023-07-03 17:52 ` [PATCH v9 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor наб
  2023-07-03 17:52 ` [PATCH v9 3/3] posix: add test for REG_STARTEND наб
  0 siblings, 2 replies; 3+ messages in thread
From: наб @ 2023-07-03 17:52 UTC (permalink / raw)
  Cc: Carlos O'Donell, Adhemerval Zanella Netto, Paul Eggert, libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1421 bytes --]

The POSIX API always stops at the first NUL, so its behaviour is unchanged.

The BSD REG_STARTEND API, with its explicit range, can include NULs
within that range, and those NULs are matched with . and [^].

Heretofore, for a string of "a\0c", glibc would match "[^q]c", but not
".c". This is both inconsistent and nonconformant to BSD REG_STARTEND.

With this patch, they're identical like you'd expect, and the
  tst-reg-startend.c: ..c: a^@c: no match$
failure is removed.

Another approach would be to remove it from _RE_SYNTAX_POSIX_COMMON,
but it's unclear to me what the custody chain is like for that and what
other regex APIs glibc offers that could be affected by this.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
I hereby disclaim all copyright interest in this changeset.

No-change clean rebase.

 posix/regcomp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/posix/regcomp.c b/posix/regcomp.c
index 12650714c0..a928ef6c2d 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -462,7 +462,7 @@ regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 {
   reg_errcode_t ret;
   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
-			 : RE_SYNTAX_POSIX_BASIC);
+			 : RE_SYNTAX_POSIX_BASIC) & ~RE_DOT_NOT_NULL;
 
   preg->buffer = NULL;
   preg->allocated = 0;
-- 
2.39.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v9 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor
  2023-07-03 17:52 [PATCH v9 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL наб
@ 2023-07-03 17:52 ` наб
  2023-07-03 17:52 ` [PATCH v9 3/3] posix: add test for REG_STARTEND наб
  1 sibling, 0 replies; 3+ messages in thread
From: наб @ 2023-07-03 17:52 UTC (permalink / raw)
  Cc: Carlos O'Donell, Adhemerval Zanella Netto, Paul Eggert, libc-alpha

[-- Attachment #1: Type: text/plain, Size: 3419 bytes --]

re_search_internal () starts with
  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
	  || !preg->newline_anchor))
    {
      if (start != 0 && last_start != 0)
        return REG_NOMATCH;
      start = last_start = 0;
    }
and heretofore start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.

Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.

This brings us to compatibility with the BSD spec and implementations.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
I hereby disclaim all copyright interest in this changeset.

 posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/posix/regexec.c b/posix/regexec.c
index bd0cd412d0..2ef868e1f6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
    string; if REG_NOTEOL is set, then $ does not match at the end.
 
    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
-   EFLAGS is invalid.  */
+   EFLAGS is invalid.
+
+   If REG_STARTEND, the bounds are
+     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+   instead of the usual
+     [STRING, STRING + strlen(STRING)),
+   but returned matches are still referenced to STRING,
+   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+   re_search_internal () has a built-in assumption of
+   (start != 0) <=> (^ doesn't match), so give it a truncated view
+   and fix up the matches afterward.  */
 
 int
 regexec (const regex_t *__restrict preg, const char *__restrict string,
 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
 {
   reg_errcode_t err;
-  Idx start, length;
+  Idx startoff = 0, length;
   re_dfa_t *dfa = preg->buffer;
+  size_t i = 0;
 
   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
     return REG_BADPAT;
 
   if (eflags & REG_STARTEND)
     {
-      start = pmatch[0].rm_so;
-      length = pmatch[0].rm_eo;
+      startoff = pmatch[0].rm_so;
+      string += startoff;
+      length = pmatch[0].rm_eo - startoff;
     }
   else
-    {
-      start = 0;
-      length = strlen (string);
-    }
+    length = strlen (string);
 
   lock_lock (dfa->lock);
   if (preg->no_sub)
-    err = re_search_internal (preg, string, length, start, length,
-			      length, 0, NULL, eflags);
-  else
-    err = re_search_internal (preg, string, length, start, length,
-			      length, nmatch, pmatch, eflags);
+    nmatch = 0;
+  err = re_search_internal (preg, string, length, 0, length,
+			    length, nmatch, pmatch, eflags);
   lock_unlock (dfa->lock);
+
+  if (err == REG_NOERROR && startoff)
+    for (i = 0; i < nmatch; ++i)
+      if (pmatch[i].rm_so != -1)
+	{
+	  pmatch[i].rm_so += startoff;
+	  pmatch[i].rm_eo += startoff;
+	}
   return err != REG_NOERROR;
 }
 
-- 
2.39.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH v9 3/3] posix: add test for REG_STARTEND
  2023-07-03 17:52 [PATCH v9 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL наб
  2023-07-03 17:52 ` [PATCH v9 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor наб
@ 2023-07-03 17:52 ` наб
  1 sibling, 0 replies; 3+ messages in thread
From: наб @ 2023-07-03 17:52 UTC (permalink / raw)
  Cc: Carlos O'Donell, Adhemerval Zanella Netto, Paul Eggert, libc-alpha

[-- Attachment #1: Type: text/plain, Size: 6178 bytes --]

This test passes on NetBSD, the illumos gate, musl with
https://www.openwall.com/lists/musl/2023/05/14/1,
and now glibc.
It's nothing revolutionary and the behaviour it tests
is largely guaranteed by the 4.4BSD-Lite manual;
nevertheless, it used to fail with
  tst-reg-startend.c: ^a: a^@c: no match$
  tst-reg-startend.c: ^a: a^@c: wanted {1, 2}, got {1, 4}$
  tst-reg-startend.c: ^a: abc: no match$
  tst-reg-startend.c: ^a: abc: wanted {1, 2}, got {1, 4}$
  tst-reg-startend.c: ^a.c$: a^@c: no match$
  tst-reg-startend.c: ^a.c$: abc: no match$
  tst-reg-startend.c: ^a.*c$: a^@c: no match$
  tst-reg-startend.c: ^a.*c$: abc: no match$
  tst-reg-startend.c: ^a[^c]c$: a^@c: no match$
  tst-reg-startend.c: ^a[^c]c$: abc: no match$
  tst-reg-startend.c: ^a..: a^@c: no match$
  tst-reg-startend.c: ^a..: abc: no match$
  tst-reg-startend.c: ..c: a^@c: no match$

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 posix/Makefile           |   1 +
 posix/tst-reg-startend.c | 142 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 posix/tst-reg-startend.c

diff --git a/posix/Makefile b/posix/Makefile
index ad43cbdec6..10b64206f1 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -296,6 +296,7 @@ tests := \
   tst-posix_spawn-setsid \
   tst-preadwrite \
   tst-preadwrite64 \
+  tst-reg-startend \
   tst-regcomp-truncated \
   tst-regex \
   tst-regex2 \
diff --git a/posix/tst-reg-startend.c b/posix/tst-reg-startend.c
new file mode 100644
index 0000000000..854d430676
--- /dev/null
+++ b/posix/tst-reg-startend.c
@@ -0,0 +1,142 @@
+/* This is free and unencumbered software released into the public domain.
+
+   Anyone is free to copy, modify, publish, use, compile, sell, or
+   distribute this software, either in source code form or as a compiled
+   binary, for any purpose, commercial or non-commercial, and by any
+   means.
+
+   In jurisdictions that recognize copyright laws, the author or authors
+   of this software dedicate any and all copyright interest in the
+   software to the public domain. We make this dedication for the benefit
+   of the public at large and to the detriment of our heirs and
+   successors. We intend this dedication to be an overt act of
+   relinquishment in perpetuity of all present and future rights to this
+   software under copyright law.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+   IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+   OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+   OTHER DEALINGS IN THE SOFTWARE.  */
+
+
+#include <assert.h>
+#include <locale.h>
+#include <string.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <support/check.h>
+
+
+static const regmatch_t bound = {1, 4};
+
+
+struct reg_res {
+  const char *regex;
+  regmatch_t result;
+};
+static const struct reg_res reg_res_ac[] = {
+  {"^a",       {1, 2}},
+  {"c$",       {3, 4}},
+  {"^a.c$",    {1, 4}},
+  {"^a.*c$",   {1, 4}},
+  {"^a[^c]c$", {1, 4}},
+  {"^a..",     {1, 4}},
+  {"..c",      {1, 4}},
+  {"[^z]c",    {2, 4}},
+  {}
+};
+static const char *const data_ac[] = {"_a\0cdef", "_abcdef", NULL};
+
+static const struct reg_res reg_res_aa[] = {
+  {"^",             {1, 1}},
+  {"^a",            {1, 2}},
+  {"a$",            {3, 4}},
+  {"^\\(a\\).\\1$", {1, 4}},
+  {"^a[^a]*" ,      {1, 3}},
+  {}
+};
+static const char *const data_aa[] = {"_a\0adef", "_abadef", NULL};
+
+
+static void
+testbunch (const struct reg_res *reg_reses, const char *const *const data)
+{
+#define BASEERR(data)                                      \
+  support_record_failure (),                               \
+    fprintf (stdout, __FILE__ ": %s: ", reg_reses->regex), \
+    fwrite (data + bound.rm_so, 1, bound.rm_eo - bound.rm_so, stdout)
+
+  for (; reg_reses->regex; ++reg_reses)
+    {
+      regex_t rgx;
+      assert (!regcomp (&rgx, reg_reses->regex, 0));
+
+      for (const char *const *dt = data; *dt; ++dt)
+        {
+          regmatch_t match = bound;
+          if (regexec (&rgx, *dt, 1, &match, REG_STARTEND))
+            BASEERR(dt), fputs (": no match\n", stdout);
+
+          if (memcmp(&match, &reg_reses->result, sizeof (regmatch_t)))
+            BASEERR(dt), fprintf (stdout, ": wanted {%d, %d}, got {%d, %d}\n",
+                                  (int)reg_reses->result.rm_so,
+                                  (int)reg_reses->result.rm_eo,
+                                  (int)match.rm_so, (int)match.rm_eo);
+        }
+
+      regfree(&rgx);
+    }
+}
+
+
+struct mb_data_exp {
+  const char *data;
+  bool exp;
+};
+static const struct mb_data_exp mb_data_exp[] = {
+  {"_aaćdef", false},
+  {"_aćdef", true},
+  {}
+};
+
+static void
+testmb (void)
+{
+  regex_t rgx;
+  const struct reg_res reg_reses[] = {{"ać"}};
+  assert (!regcomp (&rgx, reg_reses->regex, 0));
+
+  for (const struct mb_data_exp *de = mb_data_exp; de->data; ++de)
+    {
+      regmatch_t match = bound;
+      if (regexec (&rgx, de->data, 1, &match, REG_STARTEND) == de->exp)
+        BASEERR(de->data), fprintf (stdout, ": %s match\n",
+                                    de->exp ? "no" : "yes");
+
+      if (memcmp(&match, &bound, sizeof (regmatch_t)))
+        BASEERR(de->data), fprintf (stdout, ": wanted {%d, %d}, got {%d, %d}\n",
+                                    (int)bound.rm_so, (int)bound.rm_eo,
+                                    (int)match.rm_so, (int)match.rm_eo);
+    }
+
+  regfree(&rgx);
+}
+
+
+static int
+do_test (void)
+{
+  assert (setlocale (LC_ALL, "C.UTF-8"));
+
+  testbunch (reg_res_ac, data_ac);
+  testbunch (reg_res_aa, data_aa);
+  testmb ();
+  return 0;
+}
+
+
+#include <support/test-driver.c>
-- 
2.39.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-07-03 17:52 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-03 17:52 [PATCH v9 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL наб
2023-07-03 17:52 ` [PATCH v9 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor наб
2023-07-03 17:52 ` [PATCH v9 3/3] posix: add test for REG_STARTEND наб

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).