* [PATCH v11 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor
2023-08-20 12:37 [PATCH v11 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL Ahelenia Ziemiańska
@ 2023-08-20 12:38 ` Ahelenia Ziemiańska
2023-08-20 12:38 ` [PATCH v11 3/3] posix: add test for REG_STARTEND наб
1 sibling, 0 replies; 3+ messages in thread
From: Ahelenia Ziemiańska @ 2023-08-20 12:38 UTC (permalink / raw)
Cc: Carlos O'Donell, Adhemerval Zanella Netto, Paul Eggert, libc-alpha
[-- Attachment #1: Type: text/plain, Size: 3282 bytes --]
re_search_internal () starts with
/* If initial states with non-begbuf contexts have no elements,
the regex must be anchored. If preg->newline_anchor is set,
we'll never use init_state_nl, so do not check it. */
if (dfa->init_state->nodes.nelem == 0
&& dfa->init_state_word->nodes.nelem == 0
&& (dfa->init_state_nl->nodes.nelem == 0
|| !preg->newline_anchor))
{
if (start != 0 && last_start != 0)
return REG_NOMATCH;
start = last_start = 0;
}
and heretofore start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.
Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.
This brings us to compatibility with the BSD spec and implementations.
---
posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/posix/regexec.c b/posix/regexec.c
index bd0cd412d0..2ef868e1f6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
string; if REG_NOTEOL is set, then $ does not match at the end.
Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
- EFLAGS is invalid. */
+ EFLAGS is invalid.
+
+ If REG_STARTEND, the bounds are
+ [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+ instead of the usual
+ [STRING, STRING + strlen(STRING)),
+ but returned matches are still referenced to STRING,
+ and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+ re_search_internal () has a built-in assumption of
+ (start != 0) <=> (^ doesn't match), so give it a truncated view
+ and fix up the matches afterward. */
int
regexec (const regex_t *__restrict preg, const char *__restrict string,
size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
{
reg_errcode_t err;
- Idx start, length;
+ Idx startoff = 0, length;
re_dfa_t *dfa = preg->buffer;
+ size_t i = 0;
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
return REG_BADPAT;
if (eflags & REG_STARTEND)
{
- start = pmatch[0].rm_so;
- length = pmatch[0].rm_eo;
+ startoff = pmatch[0].rm_so;
+ string += startoff;
+ length = pmatch[0].rm_eo - startoff;
}
else
- {
- start = 0;
- length = strlen (string);
- }
+ length = strlen (string);
lock_lock (dfa->lock);
if (preg->no_sub)
- err = re_search_internal (preg, string, length, start, length,
- length, 0, NULL, eflags);
- else
- err = re_search_internal (preg, string, length, start, length,
- length, nmatch, pmatch, eflags);
+ nmatch = 0;
+ err = re_search_internal (preg, string, length, 0, length,
+ length, nmatch, pmatch, eflags);
lock_unlock (dfa->lock);
+
+ if (err == REG_NOERROR && startoff)
+ for (i = 0; i < nmatch; ++i)
+ if (pmatch[i].rm_so != -1)
+ {
+ pmatch[i].rm_so += startoff;
+ pmatch[i].rm_eo += startoff;
+ }
return err != REG_NOERROR;
}
--
2.39.2
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH v11 3/3] posix: add test for REG_STARTEND
2023-08-20 12:37 [PATCH v11 1/3] posix: regcomp(): clear RE_DOT_NOT_NULL Ahelenia Ziemiańska
2023-08-20 12:38 ` [PATCH v11 2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor Ahelenia Ziemiańska
@ 2023-08-20 12:38 ` наб
1 sibling, 0 replies; 3+ messages in thread
From: наб @ 2023-08-20 12:38 UTC (permalink / raw)
Cc: Carlos O'Donell, Adhemerval Zanella Netto, Paul Eggert, libc-alpha
[-- Attachment #1: Type: text/plain, Size: 6178 bytes --]
This test passes on NetBSD, the illumos gate, musl with
https://www.openwall.com/lists/musl/2023/05/14/1,
and now glibc.
It's nothing revolutionary and the behaviour it tests
is largely guaranteed by the 4.4BSD-Lite manual;
nevertheless, it used to fail with
tst-reg-startend.c: ^a: a^@c: no match$
tst-reg-startend.c: ^a: a^@c: wanted {1, 2}, got {1, 4}$
tst-reg-startend.c: ^a: abc: no match$
tst-reg-startend.c: ^a: abc: wanted {1, 2}, got {1, 4}$
tst-reg-startend.c: ^a.c$: a^@c: no match$
tst-reg-startend.c: ^a.c$: abc: no match$
tst-reg-startend.c: ^a.*c$: a^@c: no match$
tst-reg-startend.c: ^a.*c$: abc: no match$
tst-reg-startend.c: ^a[^c]c$: a^@c: no match$
tst-reg-startend.c: ^a[^c]c$: abc: no match$
tst-reg-startend.c: ^a..: a^@c: no match$
tst-reg-startend.c: ^a..: abc: no match$
tst-reg-startend.c: ..c: a^@c: no match$
Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
posix/Makefile | 1 +
posix/tst-reg-startend.c | 142 +++++++++++++++++++++++++++++++++++++++
2 files changed, 143 insertions(+)
create mode 100644 posix/tst-reg-startend.c
diff --git a/posix/Makefile b/posix/Makefile
index 3d368b91f6..84baa70c48 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -305,6 +305,7 @@ tests := \
tst-posix_spawn-setsid \
tst-preadwrite \
tst-preadwrite64 \
+ tst-reg-startend \
tst-regcomp-truncated \
tst-regex \
tst-regex2 \
diff --git a/posix/tst-reg-startend.c b/posix/tst-reg-startend.c
new file mode 100644
index 0000000000..854d430676
--- /dev/null
+++ b/posix/tst-reg-startend.c
@@ -0,0 +1,142 @@
+/* This is free and unencumbered software released into the public domain.
+
+ Anyone is free to copy, modify, publish, use, compile, sell, or
+ distribute this software, either in source code form or as a compiled
+ binary, for any purpose, commercial or non-commercial, and by any
+ means.
+
+ In jurisdictions that recognize copyright laws, the author or authors
+ of this software dedicate any and all copyright interest in the
+ software to the public domain. We make this dedication for the benefit
+ of the public at large and to the detriment of our heirs and
+ successors. We intend this dedication to be an overt act of
+ relinquishment in perpetuity of all present and future rights to this
+ software under copyright law.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE. */
+
+
+#include <assert.h>
+#include <locale.h>
+#include <string.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <support/check.h>
+
+
+static const regmatch_t bound = {1, 4};  /* REG_STARTEND window copied into pmatch[0] before every regexec call.  */
+
+
+struct reg_res {  /* One test case: a pattern and the match expected for it.  */
+ const char *regex;  /* Pattern handed to regcomp with cflags 0.  */
+ regmatch_t result;  /* Expected pmatch[0] after regexec; compared bytewise with memcmp.  */
+};
+static const struct reg_res reg_res_ac[] = {  /* Cases that do_test runs against data_ac.  */
+ {"^a", {1, 2}},
+ {"c$", {3, 4}},
+ {"^a.c$", {1, 4}},
+ {"^a.*c$", {1, 4}},
+ {"^a[^c]c$", {1, 4}},
+ {"^a..", {1, 4}},
+ {"..c", {1, 4}},
+ {"[^z]c", {2, 4}},
+ {}  /* Terminator: testbunch stops at the first null regex.  */
+};
+static const char *const data_ac[] = {"_a\0cdef", "_abcdef", NULL};  /* NULL-terminated list; the first entry has an embedded NUL.  */
+
+static const struct reg_res reg_res_aa[] = {  /* Cases that do_test runs against data_aa.  */
+ {"^", {1, 1}},
+ {"^a", {1, 2}},
+ {"a$", {3, 4}},
+ {"^\\(a\\).\\1$", {1, 4}},
+ {"^a[^a]*" , {1, 3}},
+ {}  /* Terminator: testbunch stops at the first null regex.  */
+};
+static const char *const data_aa[] = {"_a\0adef", "_abadef", NULL};  /* NULL-terminated list; the first entry has an embedded NUL.  */
+
+
+/* Run each regex of REG_RESES on each DATA string within BOUND; record any mismatch.  */
+static void
+testbunch (const struct reg_res *reg_reses, const char *const *const data)
+{
+#define BASEERR(data) \
+  support_record_failure (), \
+  fprintf (stdout, __FILE__ ": %s: ", reg_reses->regex), \
+  fwrite ((data) + bound.rm_so, 1, bound.rm_eo - bound.rm_so, stdout)
+
+  for (; reg_reses->regex; ++reg_reses)  /* The table ends at a null regex.  */
+    {
+      regex_t rgx;
+      TEST_VERIFY_EXIT (regcomp (&rgx, reg_reses->regex, 0) == 0);  /* Not assert: must run under NDEBUG too.  */
+
+      for (const char *const *dt = data; *dt; ++dt)
+        {
+          regmatch_t match = bound;  /* REG_STARTEND takes its window from pmatch[0].  */
+          if (regexec (&rgx, *dt, 1, &match, REG_STARTEND))
+            BASEERR(*dt), fputs (": no match\n", stdout);  /* Pass the string, not the iterator.  */
+
+          if (memcmp(&match, &reg_reses->result, sizeof (regmatch_t)))
+            BASEERR(*dt), fprintf (stdout, ": wanted {%d, %d}, got {%d, %d}\n",
+              (int)reg_reses->result.rm_so, (int)reg_reses->result.rm_eo,
+              (int)match.rm_so, (int)match.rm_eo);
+        }
+
+      regfree(&rgx);
+    }
+}
+
+
+struct mb_data_exp {  /* One input for the multibyte test.  */
+ const char *data;  /* Subject string passed to regexec.  */
+ bool exp;  /* Whether testmb expects regexec to report a match.  */
+};
+static const struct mb_data_exp mb_data_exp[] = {  /* Inputs iterated by testmb.  */
+ {"_aaćdef", false},  /* NOTE(review): assuming UTF-8 source, the {1, 4} window ends inside the 2-byte "ć" -- confirm.  */
+ {"_aćdef", true},
+ {}  /* Terminator: testmb stops at the first null data.  */
+};
+
+/* Match "ać" against each mb_data_exp entry within BOUND and check the outcome.  */
+static void
+testmb (void)
+{
+  regex_t rgx;
+  const struct reg_res reg_reses[] = {{"ać"}};  /* Named reg_reses because BASEERR prints reg_reses->regex.  */
+  TEST_VERIFY_EXIT (regcomp (&rgx, reg_reses->regex, 0) == 0);  /* Not assert: must run under NDEBUG too.  */
+
+  for (const struct mb_data_exp *de = mb_data_exp; de->data; ++de)
+    {
+      regmatch_t match = bound;
+      /* regexec returns 0 on a match; do not rely on REG_NOMATCH being 1.  */
+      if ((regexec (&rgx, de->data, 1, &match, REG_STARTEND) == 0) != de->exp)
+        BASEERR(de->data), fprintf (stdout, ": %s match\n",
+                                    de->exp ? "no" : "yes");
+
+      if (memcmp(&match, &bound, sizeof (regmatch_t)))
+        BASEERR(de->data), fprintf (stdout, ": wanted {%d, %d}, got {%d, %d}\n",
+          (int)bound.rm_so, (int)bound.rm_eo, (int)match.rm_so, (int)match.rm_eo);
+    }
+  regfree(&rgx);
+}
+
+
+static int
+do_test (void)
+{
+  /* Not assert: setlocale must run under NDEBUG too (presumably for testmb's "ć").  */
+  TEST_VERIFY_EXIT (setlocale (LC_ALL, "C.UTF-8") != NULL);
+  testbunch (reg_res_ac, data_ac);
+  testbunch (reg_res_aa, data_aa);
+  testmb ();
+  return 0;
+}
+
+
+#include <support/test-driver.c>
--
2.39.2
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread