public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] regex: Unnest nested functions in regcomp.c
@ 2021-10-27  5:29 Fangrui Song
  2021-10-27  9:35 ` Paul Eggert
  2021-11-02  2:40 ` Carlos O'Donell
  0 siblings, 2 replies; 5+ messages in thread
From: Fangrui Song @ 2021-10-27  5:29 UTC (permalink / raw)
  To: libc-alpha, Paul Eggert

If the non-buildable `sysdeps/generic/dl-machine.h` doesn't count,
this patch removes the last `auto inline` usage and makes the file
buildable by Clang once we can resolve the "asm label after first use"
issue[1].

collseqwc, table_size, symb_table, extra are now initialized to appease
GCC -Werror=maybe-uninitialized false positive.

[1]: https://maskray.me/blog/2021-10-10-when-can-glibc-be-built-with-clang#asm-label-after-first-use
---
 posix/regcomp.c | 514 +++++++++++++++++++++++++-----------------------
 1 file changed, 266 insertions(+), 248 deletions(-)

diff --git a/posix/regcomp.c b/posix/regcomp.c
index 887e5b5068..e2b5c2807a 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2831,6 +2831,263 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name)
 }
 #endif /* not _LIBC */
 
+#ifdef _LIBC
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Seek the collating symbol entry corresponding to NAME.
+   Return the index of the symbol in the SYMB_TABLE,
+   or -1 if not found.  */
+
+static inline int32_t
+__attribute__ ((always_inline))
+seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
+			     const int32_t *symb_table, int32_t table_size,
+			     const unsigned char *extra)
+{
+  int32_t elem;
+
+  for (elem = 0; elem < table_size; elem++)
+    if (symb_table[2 * elem] != 0)
+      {
+	int32_t idx = symb_table[2 * elem + 1];
+	/* Skip the name of collating element name.  */
+	idx += 1 + extra[idx];
+	if (/* Compare the length of the name.  */
+	    name_len == extra[idx]
+	    /* Compare the name.  */
+	    && memcmp (name, &extra[idx + 1], name_len) == 0)
+	  /* Yep, this is the entry.  */
+	  return elem;
+      }
+  return -1;
+}
+
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Look up the collation sequence value of BR_ELEM.
+   Return the value if succeeded, UINT_MAX otherwise.  */
+
+static inline unsigned int
+__attribute__ ((always_inline))
+lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
+				 const unsigned char *collseqmb,
+				 const char *collseqwc, int32_t table_size,
+				 const int32_t *symb_table,
+				 const unsigned char *extra)
+{
+  if (br_elem->type == SB_CHAR)
+    {
+      /* if (MB_CUR_MAX == 1) */
+      if (nrules == 0)
+	return collseqmb[br_elem->opr.ch];
+      else
+	{
+	  wint_t wc = __btowc (br_elem->opr.ch);
+	  return __collseq_table_lookup (collseqwc, wc);
+	}
+    }
+  else if (br_elem->type == MB_CHAR)
+    {
+      if (nrules != 0)
+	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+    }
+  else if (br_elem->type == COLL_SYM)
+    {
+      size_t sym_name_len = strlen ((char *) br_elem->opr.name);
+      if (nrules != 0)
+	{
+	  int32_t elem, idx;
+	  elem = seek_collating_symbol_entry (br_elem->opr.name,
+					      sym_name_len,
+					      symb_table, table_size,
+					      extra);
+	  if (elem != -1)
+	    {
+	      /* We found the entry.  */
+	      idx = symb_table[2 * elem + 1];
+	      /* Skip the name of collating element name.  */
+	      idx += 1 + extra[idx];
+	      /* Skip the byte sequence of the collating element.  */
+	      idx += 1 + extra[idx];
+	      /* Adjust for the alignment.  */
+	      idx = (idx + 3) & ~3;
+	      /* Skip the multibyte collation sequence value.  */
+	      idx += sizeof (unsigned int);
+	      /* Skip the wide char sequence of the collating element.  */
+	      idx += sizeof (unsigned int) *
+		(1 + *(unsigned int *) (extra + idx));
+	      /* Return the collation sequence value.  */
+	      return *(unsigned int *) (extra + idx);
+	    }
+	  else if (sym_name_len == 1)
+	    {
+	      /* No valid character.  Match it as a single byte
+		 character.  */
+	      return collseqmb[br_elem->opr.name[0]];
+	    }
+	}
+      else if (sym_name_len == 1)
+	return collseqmb[br_elem->opr.name[0]];
+    }
+  return UINT_MAX;
+}
+
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Build the range expression which starts from START_ELEM, and ends
+   at END_ELEM.  The result are written to MBCSET and SBCSET.
+   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
+   mbcset->range_ends, is a pointer argument since we may
+   update it.  */
+
+static inline reg_errcode_t
+__attribute__ ((always_inline))
+build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
+		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
+		 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
+		 const unsigned char *collseqmb, const char *collseqwc,
+		 int32_t table_size, const int32_t *symb_table,
+		 const unsigned char *extra)
+{
+  unsigned int ch;
+  uint32_t start_collseq;
+  uint32_t end_collseq;
+
+  /* Equivalence Classes and Character Classes can't be a range
+     start/end.  */
+  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
+                        || start_elem->type == CHAR_CLASS
+                        || end_elem->type == EQUIV_CLASS
+                        || end_elem->type == CHAR_CLASS))
+    return REG_ERANGE;
+
+  /* FIXME: Implement rational ranges here, too.  */
+  start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
+						   table_size, symb_table, extra);
+  end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
+						 table_size, symb_table, extra);
+  /* Check start/end collation sequence values.  */
+  if (__glibc_unlikely (start_collseq == UINT_MAX
+                        || end_collseq == UINT_MAX))
+    return REG_ECOLLATE;
+  if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
+                        && start_collseq > end_collseq))
+    return REG_ERANGE;
+
+  /* Got valid collation sequence values, add them as a new entry.
+     However, if we have no collation elements, and the character set
+     is single byte, the single byte character set that we
+     build below suffices. */
+  if (nrules > 0 || dfa->mb_cur_max > 1)
+    {
+      /* Check the space of the arrays.  */
+      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
+	{
+	  /* There is not enough space, need realloc.  */
+	  uint32_t *new_array_start;
+	  uint32_t *new_array_end;
+	  int new_nranges;
+
+	  /* +1 in case of mbcset->nranges is 0.  */
+	  new_nranges = 2 * mbcset->nranges + 1;
+	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
+					new_nranges);
+	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
+				      new_nranges);
+
+          if (__glibc_unlikely (new_array_start == NULL
+                                || new_array_end == NULL))
+	    return REG_ESPACE;
+
+	  mbcset->range_starts = new_array_start;
+	  mbcset->range_ends = new_array_end;
+	  *range_alloc = new_nranges;
+	}
+
+      mbcset->range_starts[mbcset->nranges] = start_collseq;
+      mbcset->range_ends[mbcset->nranges++] = end_collseq;
+    }
+
+  /* Build the table for single byte characters.  */
+  for (ch = 0; ch < SBC_MAX; ch++)
+    {
+      uint32_t ch_collseq;
+      /* if (MB_CUR_MAX == 1) */
+      if (nrules == 0)
+	ch_collseq = collseqmb[ch];
+      else
+	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
+      if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
+	bitset_set (sbcset, ch);
+    }
+  return REG_NOERROR;
+}
+
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Build the collating element which is represented by NAME.
+   The result are written to MBCSET and SBCSET.
+   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
+   pointer argument since we may update it.  */
+
+static inline reg_errcode_t
+__attribute__ ((always_inline))
+build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
+			int *coll_sym_alloc, const unsigned char *name,
+			uint32_t nrules, int32_t table_size,
+			const int32_t *symb_table, const unsigned char *extra)
+{
+  int32_t elem, idx;
+  size_t name_len = strlen ((const char *) name);
+  if (nrules != 0)
+    {
+      elem = seek_collating_symbol_entry (name, name_len, symb_table,
+					  table_size, extra);
+      if (elem != -1)
+	{
+	  /* We found the entry.  */
+	  idx = symb_table[2 * elem + 1];
+	  /* Skip the name of collating element name.  */
+	  idx += 1 + extra[idx];
+	}
+      else if (name_len == 1)
+	{
+	  /* No valid character, treat it as a normal
+	     character.  */
+	  bitset_set (sbcset, name[0]);
+	  return REG_NOERROR;
+	}
+      else
+	return REG_ECOLLATE;
+
+      /* Got valid collation sequence, add it as a new entry.  */
+      /* Check the space of the arrays.  */
+      if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
+	{
+	  /* Not enough, realloc it.  */
+	  /* +1 in case of mbcset->ncoll_syms is 0.  */
+	  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
+	  /* Use realloc since mbcset->coll_syms is NULL
+	     if *alloc == 0.  */
+	  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
+					       new_coll_sym_alloc);
+          if (__glibc_unlikely (new_coll_syms == NULL))
+	    return REG_ESPACE;
+	  mbcset->coll_syms = new_coll_syms;
+	  *coll_sym_alloc = new_coll_sym_alloc;
+	}
+      mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
+      return REG_NOERROR;
+    }
+  else
+    {
+      if (__glibc_unlikely (name_len != 1))
+	return REG_ECOLLATE;
+      else
+	{
+	  bitset_set (sbcset, name[0]);
+	  return REG_NOERROR;
+	}
+    }
+}
+#endif /* _LIBC */
+
 /* This function parse bracket expression like "[abc]", "[a-c]",
    "[[.a-a.]]" etc.  */
 
@@ -2840,253 +3097,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 {
 #ifdef _LIBC
   const unsigned char *collseqmb;
-  const char *collseqwc;
+  const char *collseqwc = NULL;
   uint32_t nrules;
-  int32_t table_size;
-  const int32_t *symb_table;
-  const unsigned char *extra;
-
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Seek the collating symbol entry corresponding to NAME.
-     Return the index of the symbol in the SYMB_TABLE,
-     or -1 if not found.  */
-
-  auto inline int32_t
-  __attribute__ ((always_inline))
-  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
-    {
-      int32_t elem;
-
-      for (elem = 0; elem < table_size; elem++)
-	if (symb_table[2 * elem] != 0)
-	  {
-	    int32_t idx = symb_table[2 * elem + 1];
-	    /* Skip the name of collating element name.  */
-	    idx += 1 + extra[idx];
-	    if (/* Compare the length of the name.  */
-		name_len == extra[idx]
-		/* Compare the name.  */
-		&& memcmp (name, &extra[idx + 1], name_len) == 0)
-	      /* Yep, this is the entry.  */
-	      return elem;
-	  }
-      return -1;
-    }
-
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Look up the collation sequence value of BR_ELEM.
-     Return the value if succeeded, UINT_MAX otherwise.  */
-
-  auto inline unsigned int
-  __attribute__ ((always_inline))
-  lookup_collation_sequence_value (bracket_elem_t *br_elem)
-    {
-      if (br_elem->type == SB_CHAR)
-	{
-	  /*
-	  if (MB_CUR_MAX == 1)
-	  */
-	  if (nrules == 0)
-	    return collseqmb[br_elem->opr.ch];
-	  else
-	    {
-	      wint_t wc = __btowc (br_elem->opr.ch);
-	      return __collseq_table_lookup (collseqwc, wc);
-	    }
-	}
-      else if (br_elem->type == MB_CHAR)
-	{
-	  if (nrules != 0)
-	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
-	}
-      else if (br_elem->type == COLL_SYM)
-	{
-	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
-	  if (nrules != 0)
-	    {
-	      int32_t elem, idx;
-	      elem = seek_collating_symbol_entry (br_elem->opr.name,
-						  sym_name_len);
-	      if (elem != -1)
-		{
-		  /* We found the entry.  */
-		  idx = symb_table[2 * elem + 1];
-		  /* Skip the name of collating element name.  */
-		  idx += 1 + extra[idx];
-		  /* Skip the byte sequence of the collating element.  */
-		  idx += 1 + extra[idx];
-		  /* Adjust for the alignment.  */
-		  idx = (idx + 3) & ~3;
-		  /* Skip the multibyte collation sequence value.  */
-		  idx += sizeof (unsigned int);
-		  /* Skip the wide char sequence of the collating element.  */
-		  idx += sizeof (unsigned int) *
-		    (1 + *(unsigned int *) (extra + idx));
-		  /* Return the collation sequence value.  */
-		  return *(unsigned int *) (extra + idx);
-		}
-	      else if (sym_name_len == 1)
-		{
-		  /* No valid character.  Match it as a single byte
-		     character.  */
-		  return collseqmb[br_elem->opr.name[0]];
-		}
-	    }
-	  else if (sym_name_len == 1)
-	    return collseqmb[br_elem->opr.name[0]];
-	}
-      return UINT_MAX;
-    }
-
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Build the range expression which starts from START_ELEM, and ends
-     at END_ELEM.  The result are written to MBCSET and SBCSET.
-     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
-     mbcset->range_ends, is a pointer argument since we may
-     update it.  */
-
-  auto inline reg_errcode_t
-  __attribute__ ((always_inline))
-  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
-		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
-    {
-      unsigned int ch;
-      uint32_t start_collseq;
-      uint32_t end_collseq;
-
-      /* Equivalence Classes and Character Classes can't be a range
-	 start/end.  */
-      if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
-			    || start_elem->type == CHAR_CLASS
-			    || end_elem->type == EQUIV_CLASS
-			    || end_elem->type == CHAR_CLASS))
-	return REG_ERANGE;
-
-      /* FIXME: Implement rational ranges here, too.  */
-      start_collseq = lookup_collation_sequence_value (start_elem);
-      end_collseq = lookup_collation_sequence_value (end_elem);
-      /* Check start/end collation sequence values.  */
-      if (__glibc_unlikely (start_collseq == UINT_MAX
-			    || end_collseq == UINT_MAX))
-	return REG_ECOLLATE;
-      if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
-			    && start_collseq > end_collseq))
-	return REG_ERANGE;
-
-      /* Got valid collation sequence values, add them as a new entry.
-	 However, if we have no collation elements, and the character set
-	 is single byte, the single byte character set that we
-	 build below suffices. */
-      if (nrules > 0 || dfa->mb_cur_max > 1)
-	{
-	  /* Check the space of the arrays.  */
-	  if (__glibc_unlikely (*range_alloc == mbcset->nranges))
-	    {
-	      /* There is not enough space, need realloc.  */
-	      uint32_t *new_array_start;
-	      uint32_t *new_array_end;
-	      Idx new_nranges;
-
-	      /* +1 in case of mbcset->nranges is 0.  */
-	      new_nranges = 2 * mbcset->nranges + 1;
-	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
-					    new_nranges);
-	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
-					  new_nranges);
-
-	      if (__glibc_unlikely (new_array_start == NULL
-				    || new_array_end == NULL))
-		return REG_ESPACE;
-
-	      mbcset->range_starts = new_array_start;
-	      mbcset->range_ends = new_array_end;
-	      *range_alloc = new_nranges;
-	    }
-
-	  mbcset->range_starts[mbcset->nranges] = start_collseq;
-	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
-	}
-
-      /* Build the table for single byte characters.  */
-      for (ch = 0; ch < SBC_MAX; ch++)
-	{
-	  uint32_t ch_collseq;
-	  /*
-	  if (MB_CUR_MAX == 1)
-	  */
-	  if (nrules == 0)
-	    ch_collseq = collseqmb[ch];
-	  else
-	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
-	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
-	    bitset_set (sbcset, ch);
-	}
-      return REG_NOERROR;
-    }
-
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Build the collating element which is represented by NAME.
-     The result are written to MBCSET and SBCSET.
-     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
-     pointer argument since we may update it.  */
-
-  auto inline reg_errcode_t
-  __attribute__ ((always_inline))
-  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
-			  Idx *coll_sym_alloc, const unsigned char *name)
-    {
-      int32_t elem, idx;
-      size_t name_len = strlen ((const char *) name);
-      if (nrules != 0)
-	{
-	  elem = seek_collating_symbol_entry (name, name_len);
-	  if (elem != -1)
-	    {
-	      /* We found the entry.  */
-	      idx = symb_table[2 * elem + 1];
-	      /* Skip the name of collating element name.  */
-	      idx += 1 + extra[idx];
-	    }
-	  else if (name_len == 1)
-	    {
-	      /* No valid character, treat it as a normal
-		 character.  */
-	      bitset_set (sbcset, name[0]);
-	      return REG_NOERROR;
-	    }
-	  else
-	    return REG_ECOLLATE;
-
-	  /* Got valid collation sequence, add it as a new entry.  */
-	  /* Check the space of the arrays.  */
-	  if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
-	    {
-	      /* Not enough, realloc it.  */
-	      /* +1 in case of mbcset->ncoll_syms is 0.  */
-	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
-	      /* Use realloc since mbcset->coll_syms is NULL
-		 if *alloc == 0.  */
-	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
-						   new_coll_sym_alloc);
-	      if (__glibc_unlikely (new_coll_syms == NULL))
-		return REG_ESPACE;
-	      mbcset->coll_syms = new_coll_syms;
-	      *coll_sym_alloc = new_coll_sym_alloc;
-	    }
-	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
-	  return REG_NOERROR;
-	}
-      else
-	{
-	  if (__glibc_unlikely (name_len != 1))
-	    return REG_ECOLLATE;
-	  else
-	    {
-	      bitset_set (sbcset, name[0]);
-	      return REG_NOERROR;
-	    }
-	}
-    }
+  int32_t table_size = 0;
+  const int32_t *symb_table = NULL;
+  const unsigned char *extra = NULL;
 #endif
 
   re_token_t br_token;
@@ -3230,7 +3245,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 
 #ifdef _LIBC
 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
-				  &start_elem, &end_elem);
+				  &start_elem, &end_elem,
+				  dfa, syntax, nrules, collseqmb, collseqwc,
+				  table_size, symb_table, extra);
 #else
 # ifdef RE_ENABLE_I18N
 	  *err = build_range_exp (syntax, sbcset,
@@ -3283,7 +3300,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 #ifdef RE_ENABLE_I18N
 					     mbcset, &coll_sym_alloc,
 #endif /* RE_ENABLE_I18N */
-					     start_elem.opr.name);
+					     start_elem.opr.name,
+					     nrules, table_size, symb_table, extra);
 	      if (__glibc_unlikely (*err != REG_NOERROR))
 		goto parse_bracket_exp_free_return;
 	      break;
-- 
2.33.0.1079.g6e70778dc9-goog


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-11-02  2:41 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-27  5:29 [PATCH] regex: Unnest nested functions in regcomp.c Fangrui Song
2021-10-27  9:35 ` Paul Eggert
2021-10-27 16:10   ` Joseph Myers
2021-10-28  4:12     ` Fāng-ruì Sòng
2021-11-02  2:40 ` Carlos O'Donell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).