public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* PATCH: Regex performance improvements
@ 2001-05-08 10:06 Paolo Bonzini
  0 siblings, 0 replies; only message in thread
From: Paolo Bonzini @ 2001-05-08 10:06 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 978 bytes --]

Here is the unified diff for the changes I posted.

2001-05-08  Paolo Bonzini  (bonzini@gnu.org)

        * posix/regex.c [__GNUC__]: define HAVE_GOTO_VOID_P
        * posix/regex.c [HAVE_GOTO_VOID_P] (re_match_2_internal): use
        first-class labels instead of a switch statement
        * posix/regex.c (re_match_2_internal): replaced break statements
        with NEXT, to check for p > pend before going to the top of the
        loop.  Moved the end-of-pattern code from the top of the loop to
	the bottom (before the failure code).

2001-05-07  Paolo Bonzini  (bonzini@gnu.org)

        * posix/regex.c [SINGLE_STRING] (re_match_2_internal): disable
        double buffer support

        * posix/regex.c [FORCE_POSIX_DOTS] (re_match_2_internal): dot
        matches everything
        * posix/regex.c [FORCE_POSIX_DOTS] (compile_regex): set the
        appropriate syntax bits before starting the parsing loop

--
|_  _  _ __
|_)(_)| ) ,'
-------- '-._
regex.c.diff


[-- Attachment #2: regex.c.diff --]
[-- Type: text/x-diff, Size: 40307 bytes --]

--- grep-2.5e/lib/regex.c	Mon May  7 18:46:28 2001
+++ grep-2.5e-bonz/lib/regex.c	Tue May  8 18:31:18 2001
@@ -70,8 +70,6 @@
     else                                                                      \
       printf ("%C", (wint_t) c); /* Should we use wide stream??  */           \
   } while (0)
-# define TRUE 1
-# define FALSE 0
 #else
 # define CHAR_TYPE char
 # define US_CHAR_TYPE unsigned char /* unsigned character type */
@@ -80,6 +78,11 @@
 # define PUT_CHAR(c) putchar (c)
 #endif /* MBS_SUPPORT */
 
+#ifndef TRUE
+#define TRUE 1
+#define FALSE 0
+#endif
+
 #ifdef _LIBC
 /* We have to keep the namespace clean.  */
 # define regfree(preg) __regfree (preg)
@@ -393,11 +396,15 @@
 #endif /* not using relocating allocator */
 
 
+#ifdef SINGLE_STRING
+#define FIRST_STRING_P(ptr)    FALSE
+#else
 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
    `string1' or just past its end.  This works if PTR is NULL, which is
    a good thing.  */
 #define FIRST_STRING_P(ptr)                                     \
   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
+#endif
 
 /* (Re)Allocate N items of type T using malloc, or fail.  */
 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
@@ -2336,6 +2343,11 @@
      number is put in the stop_memory as the start_memory.  */
   regnum_t regnum = 0;
 
+#ifdef FORCE_POSIX_DOTS
+  syntax |= RE_DOT_NEWLINE;
+  syntax &= ~RE_DOT_NOT_NULL;
+#endif
+
 #ifdef MBS_SUPPORT
   /* Initialize the wchar_t PATTERN and offset_buffer.  */
   p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
@@ -5016,10 +5028,14 @@
               register int lim = 0;
               int irange = range;
 
+#ifdef SINGLE_STRING
+	      d = string2 + startpos;
+#else
               if (startpos < size1 && startpos + range >= size1)
                 lim = range - (size1 - startpos);
 
               d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
+#endif
 
               /* Written out as an if-else to avoid testing `translate'
                  inside the loop.  */
@@ -5036,9 +5052,13 @@
             }
           else                          /* Searching backwards.  */
             {
+#ifdef SINGLE_STRING
+              register CHAR_TYPE c = string2[startpos];
+#else
               register CHAR_TYPE c = (size1 == 0 || startpos >= size1
                                       ? string2[startpos - size1]
                                       : string1[startpos]);
+#endif
 
               if (!fastmap[(unsigned char) TRANSLATE (c)])
                 goto advance;
@@ -5103,13 +5123,38 @@
    : ((regoff_t) ((ptr) - string2 + size1)))
 #endif /* MBS_SUPPORT */
 
+#ifdef SINGLE_STRING
+
+# define MATCHING_IN_FIRST_STRING  FALSE
+
+# define PREFETCH()                                                      \
+  /* End of string => fail.  */                                         \
+  if (d == dend)	                                                \
+    goto fail;
+
+/* Test if at very beginning or at very end of the virtual concatenation
+   of `string1' and `string2'.  If only one string, it's `string2'.  */
+# define AT_STRINGS_BEG(d) ((d) == string2 || !size2)
+# define AT_STRINGS_END(d) ((d) == end2)
+
+# ifdef MBS_SUPPORT
+/* Use internationalized API instead of SYNTAX.  */
+#  define WORDCHAR_P(d)                                                  \
+  (iswalnum (*(d)) != 0)
+#else
+#  define WORDCHAR_P(d)                                                  \
+  (SYNTAX (*(d)) == Sword)
+# endif /* MBS_SUPPORT */
+
+#else
+
 /* Macros for dealing with the split strings in re_match_2.  */
 
-#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)
+# define MATCHING_IN_FIRST_STRING  (dend == end_match_1)
 
 /* Call before fetching a character with *d.  This switches over to
    string2 if necessary.  */
-#define PREFETCH()                                                      \
+# define PREFETCH()                                                      \
   while (d == dend)                                                     \
     {                                                                   \
       /* End of string2 => fail.  */                                    \
@@ -5123,14 +5168,9 @@
 
 /* Test if at very beginning or at very end of the virtual concatenation
    of `string1' and `string2'.  If only one string, it's `string2'.  */
-#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
-#define AT_STRINGS_END(d) ((d) == end2)
-
+# define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
+# define AT_STRINGS_END(d) ((d) == end2)
 
-/* Test if D points to a character which is word-constituent.  We have
-   two special cases to check for: if past the end of string1, look at
-   the first character in string2; and if before the beginning of
-   string2, look at the last character in string1.  */
 #ifdef MBS_SUPPORT
 /* Use internationalized API instead of SYNTAX.  */
 # define WORDCHAR_P(d)                                                  \
@@ -5143,6 +5183,39 @@
    == Sword)
 #endif /* MBS_SUPPORT */
 
+#endif /* !SINGLE_STRING */
+
+#ifdef __GNUC__
+#define HAVE_GOTO_VOID_P
+#endif
+
+#ifdef HAVE_GOTO_VOID_P
+# define NEXT do {                                                      \
+  if (p == pend)                                                        \
+    goto end_of_pattern;                                                \
+  else                                                                  \
+    goto *execute_command[SWITCH_ENUM_CAST ((re_opcode_t) *p++)];       \
+} while(0)
+
+# define CASE(x) label_##x:
+# define LABEL(x) &&label_##x
+
+#else
+# define NEXT do {                                                      \
+  if (p == pend)                                                        \
+    goto end_of_pattern;                                                \
+  else                                                                  \
+    goto execute_command;                                               \
+} while(0)
+
+# define CASE(x) case x:
+#endif
+
+/* Test if D points to a character which is word-constituent.  We have
+   two special cases to check for: if past the end of string1, look at
+   the first character in string2; and if before the beginning of
+   string2, look at the last character in string1.  */
+
 /* Disabled due to a compiler bug -- see comment at case wordbound */
 #if 0
 /* Test if the character before D and the one at D differ with respect
@@ -5476,6 +5549,57 @@
   unsigned num_regs_pushed = 0;
 #endif
 
+#ifdef HAVE_GOTO_VOID_P
+# ifdef emacs
+  static void *execute_command[notsyntaxspec+1] =
+# else
+  static void *execute_command[notwordbound+1] =
+# endif
+  {
+    LABEL(no_op),
+    LABEL(succeed),
+    LABEL(exactn),
+# ifdef MBS_SUPPORT
+    LABEL(exactn_bin),
+# endif
+    LABEL(anychar),
+    LABEL(charset),
+    LABEL(charset_not),
+    LABEL(start_memory),
+    LABEL(stop_memory),
+    LABEL(duplicate),
+    LABEL(begline),
+    LABEL(endline),
+    LABEL(begbuf),
+    LABEL(endbuf),
+    LABEL(jump),
+    LABEL(jump_past_alt),
+    LABEL(on_failure_jump),
+    LABEL(on_failure_keep_string_jump),
+    LABEL(pop_failure_jump),
+    LABEL(maybe_pop_jump),
+    LABEL(dummy_failure_jump),
+    LABEL(push_dummy_failure),
+    LABEL(succeed_n),
+    LABEL(jump_n),
+    LABEL(set_number_at),
+    LABEL(wordchar),
+    LABEL(notwordchar),  
+    LABEL(wordbeg),      
+    LABEL(wordend),      
+    LABEL(wordbound),
+    LABEL(notwordbound)
+
+# ifdef emacs
+    ,LABEL(before_dot),  
+    LABEL(at_dot),       
+    LABEL(after_dot),   
+    LABEL(syntaxspec),
+    LABEL(notsyntaxspec)
+# endif
+  }
+#endif /* HAVE_GOTO_VOID_P */
+
   DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
 
   INIT_FAIL_STACK ();
@@ -5531,6 +5655,9 @@
      fill them with converted string.  */
   if (csize1 != 0)
     {
+      if (csize2)
+	return -2;
+
       string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
       mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
       is_binary = REGEX_TALLOC (csize1 + 1, char);
@@ -5570,7 +5697,6 @@
      pattern to (char*) in regex_compile.  */
   p = pattern = (CHAR_TYPE*)bufp->buffer;
   pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
-
 #endif /* MBS_SUPPORT */
 
   /* Initialize subexpression text positions to -1 to mark ones that no
@@ -5596,11 +5722,20 @@
       string1 = 0;
       size1 = 0;
     }
+#ifdef SINGLE_STRING
+#ifndef MBS_SUPPORT
+  else if (size1 && size2)
+    return -2;
+#endif
+
+#else
   end1 = string1 + size1;
+#endif
   end2 = string2 + size2;
 
   /* Compute where to stop matching, within the two strings.  */
 #ifdef MBS_SUPPORT
+#ifndef SINGLE_STRING
   if (stop <= csize1)
     {
       mcnt = count_mbs_length(mbs_offset1, stop);
@@ -5608,8 +5743,11 @@
       end_match_2 = string2;
     }
   else
+#endif
     {
+#ifndef SINGLE_STRING
       end_match_1 = end1;
+#endif
       mcnt = count_mbs_length(mbs_offset2, stop-csize1);
       end_match_2 = string2 + mcnt;
     }
@@ -5619,14 +5757,18 @@
       return -1;
     }
 #else
+#ifndef SINGLE_STRING
   if (stop <= size1)
     {
       end_match_1 = string1 + stop;
       end_match_2 = string2;
     }
   else
+#endif
     {
+#ifndef SINGLE_STRING
       end_match_1 = end1;
+#endif
       end_match_2 = string2 + stop - size1;
     }
 #endif /* MBS_SUPPORT */
@@ -5638,6 +5780,7 @@
      loop, `d' can be pointing at the end of a string, but it cannot
      equal `string2'.  */
 #ifdef MBS_SUPPORT
+#ifndef SINGLE_STRING
   if (size1 > 0 && pos <= csize1)
     {
       mcnt = count_mbs_length(mbs_offset1, pos);
@@ -5645,6 +5788,7 @@
       dend = end_match_1;
     }
   else
+#endif
     {
       mcnt = count_mbs_length(mbs_offset2, pos-csize1);
       d = string2 + mcnt;
@@ -5657,12 +5801,14 @@
       return -1;
     }
 #else
+#ifndef SINGLE_STRING
   if (size1 > 0 && pos <= size1)
     {
       d = string1 + pos;
       dend = end_match_1;
     }
   else
+#endif
     {
       d = string2 + pos - size1;
       dend = end_match_2;
@@ -5686,206 +5832,30 @@
       DEBUG_PRINT2 ("\n0x%x: ", p);
 #endif
 
-      if (p == pend)
-        { /* End of pattern means we might have succeeded.  */
-          DEBUG_PRINT1 ("end of pattern ... ");
-
-          /* If we haven't matched the entire string, and we want the
-             longest match, try backtracking.  */
-          if (d != end_match_2)
-            {
-              /* 1 if this match ends in the same string (string1 or string2)
-                 as the best previous match.  */
-              boolean same_str_p = (FIRST_STRING_P (match_end)
-                                    == MATCHING_IN_FIRST_STRING);
-              /* 1 if this match is the best seen so far.  */
-              boolean best_match_p;
-
-              /* AIX compiler got confused when this was combined
-                 with the previous declaration.  */
-              if (same_str_p)
-                best_match_p = d > match_end;
-              else
-                best_match_p = !MATCHING_IN_FIRST_STRING;
-
-              DEBUG_PRINT1 ("backtracking.\n");
-
-              if (!FAIL_STACK_EMPTY ())
-                { /* More failure points to try.  */
-
-                  /* If exceeds best match so far, save it.  */
-                  if (!best_regs_set || best_match_p)
-                    {
-                      best_regs_set = true;
-                      match_end = d;
-
-                      DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
-
-                      for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
-                        {
-                          best_regstart[mcnt] = regstart[mcnt];
-                          best_regend[mcnt] = regend[mcnt];
-                        }
-                    }
-                  goto fail;
-                }
-
-              /* If no failure points, don't restore garbage.  And if
-                 last match is real best match, don't restore second
-                 best one. */
-              else if (best_regs_set && !best_match_p)
-                {
-                restore_best_regs:
-                  /* Restore best match.  It may happen that `dend ==
-                     end_match_1' while the restored d is in string2.
-                     For example, the pattern `x.*y.*z' against the
-                     strings `x-' and `y-z-', if the two strings are
-                     not consecutive in memory.  */
-                  DEBUG_PRINT1 ("Restoring best registers.\n");
-
-                  d = match_end;
-                  dend = ((d >= string1 && d <= end1)
-                           ? end_match_1 : end_match_2);
-
-                  for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
-                    {
-                      regstart[mcnt] = best_regstart[mcnt];
-                      regend[mcnt] = best_regend[mcnt];
-                    }
-                }
-            } /* d != end_match_2 */
-
-        succeed_label:
-          DEBUG_PRINT1 ("Accepting match.\n");
-          /* If caller wants register contents data back, do it.  */
-          if (regs && !bufp->no_sub)
-            {
-              /* Have the register data arrays been allocated?  */
-              if (bufp->regs_allocated == REGS_UNALLOCATED)
-                { /* No.  So allocate them with malloc.  We need one
-                     extra element beyond `num_regs' for the `-1' marker
-                     GNU code uses.  */
-                  regs->num_regs = MAX (RE_NREGS, num_regs + 1);
-                  regs->start = TALLOC (regs->num_regs, regoff_t);
-                  regs->end = TALLOC (regs->num_regs, regoff_t);
-                  if (regs->start == NULL || regs->end == NULL)
-                    {
-                      FREE_VARIABLES ();
-                      return -2;
-                    }
-                  bufp->regs_allocated = REGS_REALLOCATE;
-                }
-              else if (bufp->regs_allocated == REGS_REALLOCATE)
-                { /* Yes.  If we need more elements than were already
-                     allocated, reallocate them.  If we need fewer, just
-                     leave it alone.  */
-                  if (regs->num_regs < num_regs + 1)
-                    {
-                      regs->num_regs = num_regs + 1;
-                      RETALLOC (regs->start, regs->num_regs, regoff_t);
-                      RETALLOC (regs->end, regs->num_regs, regoff_t);
-                      if (regs->start == NULL || regs->end == NULL)
-                        {
-                          FREE_VARIABLES ();
-                          return -2;
-                        }
-                    }
-                }
-              else
-                {
-                  /* These braces fend off a "empty body in an else-statement"
-                     warning under GCC when assert expands to nothing.  */
-                  assert (bufp->regs_allocated == REGS_FIXED);
-                }
-
-              /* Convert the pointer data in `regstart' and `regend' to
-                 indices.  Register zero has to be set differently,
-                 since we haven't kept track of any info for it.  */
-              if (regs->num_regs > 0)
-                {
-                  regs->start[0] = pos;
-#ifdef MBS_SUPPORT
-                  if (MATCHING_IN_FIRST_STRING)
-                    regs->end[0] = mbs_offset1 != NULL ?
-                                        mbs_offset1[d-string1] : 0;
-                  else
-                    regs->end[0] = csize1 + (mbs_offset2 != NULL ?
-                                             mbs_offset2[d-string2] : 0);
-#else
-                  regs->end[0] = (MATCHING_IN_FIRST_STRING
-                                  ? ((regoff_t) (d - string1))
-                                  : ((regoff_t) (d - string2 + size1)));
-#endif /* MBS_SUPPORT */
-                }
-
-              /* Go through the first `min (num_regs, regs->num_regs)'
-                 registers, since that is all we initialized.  */
-              for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
-                   mcnt++)
-                {
-                  if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
-                    regs->start[mcnt] = regs->end[mcnt] = -1;
-                  else
-                    {
-                      regs->start[mcnt]
-                        = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
-                      regs->end[mcnt]
-                        = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
-                    }
-                }
-
-              /* If the regs structure we return has more elements than
-                 were in the pattern, set the extra elements to -1.  If
-                 we (re)allocated the registers, this is the case,
-                 because we always allocate enough to have at least one
-                 -1 at the end.  */
-              for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
-                regs->start[mcnt] = regs->end[mcnt] = -1;
-            } /* regs && !bufp->no_sub */
-
-          DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
-                        nfailure_points_pushed, nfailure_points_popped,
-                        nfailure_points_pushed - nfailure_points_popped);
-          DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
-
-#ifdef MBS_SUPPORT
-          if (MATCHING_IN_FIRST_STRING)
-            mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
-          else
-            mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
-                        csize1;
-          mcnt -= pos;
-#else
-          mcnt = d - pos - (MATCHING_IN_FIRST_STRING
-                            ? string1
-                            : string2 - size1);
-#endif /* MBS_SUPPORT */
-
-          DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
-
-          FREE_VARIABLES ();
-          return mcnt;
-        }
+      NEXT;
 
+#ifndef HAVE_GOTO_VOID_P
+    execute_command:
       /* Otherwise match next pattern command.  */
       switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
+#endif
         {
         /* Ignore these.  Used to ignore the n of succeed_n's which
            currently have n == 0.  */
-        case no_op:
+        CASE(no_op)
           DEBUG_PRINT1 ("EXECUTING no_op.\n");
-          break;
+          NEXT;
 
-        case succeed:
+        CASE(succeed)
           DEBUG_PRINT1 ("EXECUTING succeed.\n");
           goto succeed_label;
 
         /* Match the next n pattern characters exactly.  The following
            byte in the pattern defines n, and the n bytes after that
            are the characters to match.  */
-        case exactn:
+        CASE(exactn)
 #ifdef MBS_SUPPORT
-        case exactn_bin:
+        CASE(exactn_bin)
 #endif
           mcnt = *p++;
           DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
@@ -5927,27 +5897,29 @@
               while (--mcnt);
             }
           SET_REGS_MATCHED ();
-          break;
+          NEXT;
 
 
         /* Match any character except possibly a newline or a null.  */
-        case anychar:
+        CASE(anychar)
           DEBUG_PRINT1 ("EXECUTING anychar.\n");
 
           PREFETCH ();
 
+#ifndef FORCE_POSIX_DOTS
           if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
               || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
             goto fail;
+#endif
 
           SET_REGS_MATCHED ();
           DEBUG_PRINT2 ("  Matched `%ld'.\n", (long int) *d);
           d++;
-          break;
+          NEXT;
 
 
-        case charset:
-        case charset_not:
+        CASE(charset)
+        CASE(charset_not)
           {
             register US_CHAR_TYPE c;
 #ifdef MBS_SUPPORT
@@ -6314,7 +6286,7 @@
 #endif /* MBS_SUPPORT */
             SET_REGS_MATCHED ();
             d++;
-            break;
+            NEXT;
           }
 
 
@@ -6323,7 +6295,7 @@
            number of groups inner to this one in the next.  The text
            matched within the group is recorded (in the internal
            registers data structure) under the register number.  */
-        case start_memory:
+        CASE(start_memory)
           DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
                         (long int) *p, (long int) p[1]);
 
@@ -6366,13 +6338,13 @@
           p += 2;
           just_past_start_mem = p;
 
-          break;
+          NEXT;
 
 
         /* The stop_memory opcode represents the end of a group.  Its
            arguments are the same as start_memory's: the register
            number, and the number of inner groups.  */
-        case stop_memory:
+        CASE(stop_memory)
           DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
                         (long int) *p, (long int) p[1]);
 
@@ -6505,12 +6477,12 @@
 
           /* Move past the register number and the inner group count.  */
           p += 2;
-          break;
+          NEXT;
 
 
         /* \<digit> has been turned into a `duplicate' command which is
            followed by the numeric value of <digit> as the register number.  */
-        case duplicate:
+        CASE(duplicate)
           {
             register const CHAR_TYPE *d2, *dend2;
             int regno = *p++;   /* Get which register to match against.  */
@@ -6523,11 +6495,35 @@
             /* Where in input to try to start matching.  */
             d2 = regstart[regno];
 
+#ifdef SINGLE_STRING
+            /* Where to stop matching.  */
+            dend2 = regend[regno];
+
+	    /* At end of register contents (i.e. reg empty) => success */
+	    if (d2 != dend2)
+	      {
+	    	mcnt = dend2 - d2;
+
+		/* Not enough space in source => fail  */
+		if (d + mcnt > dend)
+		  goto fail;
+
+		/* Compare that many; failure if mismatch, else move
+		   past them.  */
+		if (translate
+		    ? bcmp_translate (d, d2, mcnt, translate)
+		    : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
+		  goto fail;
+		d += mcnt;
+	      }
+
+            /* Do this because we've match some characters.  */
+            SET_REGS_MATCHED ();
+#else
             /* Where to stop matching; if both the place to start and
                the place to stop matching are in the same string, then
                set to the place to stop, otherwise, for now have to use
                the end of the first string.  */
-
             dend2 = ((FIRST_STRING_P (regstart[regno])
                       == FIRST_STRING_P (regend[regno]))
                      ? regend[regno] : end_match_1);
@@ -6569,59 +6565,65 @@
                 /* Do this because we've match some characters.  */
                 SET_REGS_MATCHED ();
               }
+#endif
           }
-          break;
+          NEXT;
 
 
         /* begline matches the empty string at the beginning of the string
            (unless `not_bol' is set in `bufp'), and, if
            `newline_anchor' is set, after newlines.  */
-        case begline:
+        CASE(begline)
           DEBUG_PRINT1 ("EXECUTING begline.\n");
 
           if (AT_STRINGS_BEG (d))
             {
-              if (!bufp->not_bol) break;
+              if (!bufp->not_bol) NEXT;
             }
           else if (d[-1] == '\n' && bufp->newline_anchor)
             {
-              break;
+              NEXT;
             }
           /* In all other cases, we fail.  */
           goto fail;
 
 
         /* endline is the dual of begline.  */
-        case endline:
+        CASE(endline)
           DEBUG_PRINT1 ("EXECUTING endline.\n");
 
           if (AT_STRINGS_END (d))
             {
-              if (!bufp->not_eol) break;
+              if (!bufp->not_eol) NEXT;
             }
 
           /* We have to ``prefetch'' the next character.  */
+#ifdef SINGLE_STRING
+          else if (*d == '\n'
+                   && bufp->newline_anchor)
+#else
           else if ((d == end1 ? *string2 : *d) == '\n'
                    && bufp->newline_anchor)
+#endif
             {
-              break;
+              NEXT;
             }
           goto fail;
 
 
         /* Match at the very beginning of the data.  */
-        case begbuf:
+        CASE(begbuf)
           DEBUG_PRINT1 ("EXECUTING begbuf.\n");
           if (AT_STRINGS_BEG (d))
-            break;
+            NEXT;
           goto fail;
 
 
         /* Match at the very end of the data.  */
-        case endbuf:
+        CASE(endbuf)
           DEBUG_PRINT1 ("EXECUTING endbuf.\n");
           if (AT_STRINGS_END (d))
-            break;
+            NEXT;
           goto fail;
 
 
@@ -6641,7 +6643,7 @@
            stack at all is that otherwise we would have to change
            `anychar's code to do something besides goto fail in this
            case; that seems worse than this.  */
-        case on_failure_keep_string_jump:
+        CASE(on_failure_keep_string_jump)
           DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
 
           EXTRACT_NUMBER_AND_INCR (mcnt, p);
@@ -6652,7 +6654,7 @@
 #endif
 
           PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
-          break;
+          NEXT;
 
 
         /* Uses of on_failure_jump:
@@ -6667,7 +6669,7 @@
            Repeats start with an on_failure_jump that points past both
            the repetition text and either the following jump or
            pop_failure_jump back to this on_failure_jump.  */
-        case on_failure_jump:
+        CASE(on_failure_jump)
         on_failure:
           DEBUG_PRINT1 ("EXECUTING on_failure_jump");
 
@@ -6709,12 +6711,12 @@
 
           DEBUG_PRINT1 (":\n");
           PUSH_FAILURE_POINT (p + mcnt, d, -2);
-          break;
+          NEXT;
 
 
         /* A smart repeat ends with `maybe_pop_jump'.
            We change it to either `pop_failure_jump' or `jump'.  */
-        case maybe_pop_jump:
+        CASE(maybe_pop_jump)
           EXTRACT_NUMBER_AND_INCR (mcnt, p);
           DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
           {
@@ -6874,7 +6876,7 @@
               DEBUG_PRINT1 ("  Match => jump.\n");
               goto unconditional_jump;
             }
-        /* Note fall through.  */
+	  /* Note fall through.  */
 
 
         /* The end of a simple repeat has a pop_failure_jump back to
@@ -6883,7 +6885,7 @@
            points put on by this pop_failure_jump's matching
            on_failure_jump; we got through the pattern to here from the
            matching on_failure_jump, so didn't fail.  */
-        case pop_failure_jump:
+        CASE(pop_failure_jump)
           {
             /* We need to pass separate storage for the lowest and
                highest registers, even though we don't care about the
@@ -6910,7 +6912,7 @@
           /* Note fall through.  */
 
         /* Unconditionally jump (without popping any failure points).  */
-        case jump:
+        CASE(jump)
           EXTRACT_NUMBER_AND_INCR (mcnt, p);    /* Get the amount to jump.  */
           DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
           p += mcnt;                            /* Do the jump.  */
@@ -6919,12 +6921,12 @@
 #else
           DEBUG_PRINT2 ("(to 0x%x).\n", p);
 #endif
-          break;
+          NEXT;
 
 
         /* We need this opcode so we can detect where alternatives end
            in `group_match_null_string_p' et al.  */
-        case jump_past_alt:
+        CASE(jump_past_alt)
           DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
           goto unconditional_jump;
 
@@ -6934,7 +6936,7 @@
            pop_failure_jump, also, and with a pattern of, say, `a+', we
            are skipping over the on_failure_jump, so we have to push
            something meaningless for pop_failure_jump to pop.  */
-        case dummy_failure_jump:
+        CASE(dummy_failure_jump)
           DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
           /* It doesn't matter what we push for the string here.  What
              the code at `fail' tests is the value for the pattern.  */
@@ -6947,16 +6949,16 @@
            we don't want the failure point for the alternative to be
            popped.  For example, matching `(a|ab)*' against `aab'
            requires that we match the `ab' alternative.  */
-        case push_dummy_failure:
+        CASE(push_dummy_failure)
           DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
           /* See comments just above at `dummy_failure_jump' about the
              two zeroes.  */
           PUSH_FAILURE_POINT (NULL, NULL, -2);
-          break;
+          NEXT;
 
         /* Have to succeed matching what follows at least n times.
            After that, handle like `on_failure_jump'.  */
-        case succeed_n:
+        CASE(succeed_n)
           EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
           DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
 
@@ -6993,9 +6995,9 @@
 #endif /* MBS_SUPPORT */
               goto on_failure;
             }
-          break;
+          NEXT;
 
-        case jump_n:
+        CASE(jump_n)
           EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
           DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
 
@@ -7017,9 +7019,9 @@
           /* If don't have to jump any more, skip over the rest of command.  */
           else
             p += 2 * OFFSET_ADDRESS_SIZE;
-          break;
+          NEXT;
 
-        case set_number_at:
+        CASE(set_number_at)
           {
             DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
 
@@ -7032,7 +7034,7 @@
             DEBUG_PRINT3 ("  Setting 0x%x to %d.\n", p1, mcnt);
 #endif
             STORE_NUMBER (p1, mcnt);
-            break;
+            NEXT;
           }
 
 #if 0
@@ -7041,34 +7043,34 @@
            AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
            macro and introducing temporary variables works around the bug.  */
 
-        case wordbound:
+        CASE(wordbound)
           DEBUG_PRINT1 ("EXECUTING wordbound.\n");
           if (AT_WORD_BOUNDARY (d))
-            break;
+            NEXT;
           goto fail;
 
-        case notwordbound:
+        CASE(notwordbound)
           DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
           if (AT_WORD_BOUNDARY (d))
             goto fail;
-          break;
+          NEXT;
 #else
-        case wordbound:
+        CASE(wordbound)
         {
           boolean prevchar, thischar;
 
           DEBUG_PRINT1 ("EXECUTING wordbound.\n");
           if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
-            break;
+            NEXT;
 
           prevchar = WORDCHAR_P (d - 1);
           thischar = WORDCHAR_P (d);
           if (prevchar != thischar)
-            break;
+            NEXT;
           goto fail;
         }
 
-      case notwordbound:
+      CASE(notwordbound)
         {
           boolean prevchar, thischar;
 
@@ -7080,48 +7082,48 @@
           thischar = WORDCHAR_P (d);
           if (prevchar != thischar)
             goto fail;
-          break;
+          NEXT;
         }
 #endif
 
-        case wordbeg:
+        CASE(wordbeg)
           DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
           if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
-            break;
+            NEXT;
           goto fail;
 
-        case wordend:
+        CASE(wordend)
           DEBUG_PRINT1 ("EXECUTING wordend.\n");
           if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
               && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
-            break;
+            NEXT;
           goto fail;
 
 #ifdef emacs
-        case before_dot:
+        CASE(before_dot)
           DEBUG_PRINT1 ("EXECUTING before_dot.\n");
           if (PTR_CHAR_POS ((unsigned char *) d) >= point)
             goto fail;
-          break;
+          NEXT;
 
-        case at_dot:
+        CASE(at_dot)
           DEBUG_PRINT1 ("EXECUTING at_dot.\n");
           if (PTR_CHAR_POS ((unsigned char *) d) != point)
             goto fail;
-          break;
+          NEXT;
 
-        case after_dot:
+        CASE(after_dot)
           DEBUG_PRINT1 ("EXECUTING after_dot.\n");
           if (PTR_CHAR_POS ((unsigned char *) d) <= point)
             goto fail;
-          break;
+          NEXT;
 
-        case syntaxspec:
+        CASE(syntaxspec)
           DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
           mcnt = *p++;
           goto matchsyntax;
 
-        case wordchar:
+        CASE(wordchar)
           DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
           mcnt = (int) Sword;
         matchsyntax:
@@ -7131,14 +7133,14 @@
           if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
             goto fail;
           SET_REGS_MATCHED ();
-          break;
+          NEXT;
 
-        case notsyntaxspec:
+        CASE(notsyntaxspec)
           DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
           mcnt = *p++;
           goto matchnotsyntax;
 
-        case notwordchar:
+        CASE(notwordchar)
           DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
           mcnt = (int) Sword;
         matchnotsyntax:
@@ -7148,33 +7150,217 @@
           if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
             goto fail;
           SET_REGS_MATCHED ();
-          break;
+          NEXT;
 
 #else /* not emacs */
-        case wordchar:
+        CASE(wordchar)
           DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
           PREFETCH ();
           if (!WORDCHAR_P (d))
             goto fail;
           SET_REGS_MATCHED ();
           d++;
-          break;
+          NEXT;
 
-        case notwordchar:
+        CASE(notwordchar)
           DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
           PREFETCH ();
           if (WORDCHAR_P (d))
             goto fail;
           SET_REGS_MATCHED ();
           d++;
-          break;
+          NEXT;
 #endif /* not emacs */
 
+#ifndef HAVE_GOTO_VOID_P
         default:
           abort ();
+#endif
         }
       continue;  /* Successfully executed one pattern command; keep going.  */
 
+    end_of_pattern:
+      /* End of pattern means we might have succeeded.  */
+      DEBUG_PRINT1 ("end of pattern ... ");
+
+      /* If we haven't matched the entire string, and we want the
+	 longest match, try backtracking.  */
+      if (d != end_match_2)
+	{
+	  /* 1 if this match ends in the same string (string1 or string2)
+	     as the best previous match.  */
+	  boolean same_str_p = (FIRST_STRING_P (match_end)
+				== MATCHING_IN_FIRST_STRING);
+	  /* 1 if this match is the best seen so far.  */
+	  boolean best_match_p;
+	  
+	  /* AIX compiler got confused when this was combined
+	     with the previous declaration.  */
+	  if (same_str_p)
+	    best_match_p = d > match_end;
+	  else
+	    best_match_p = !MATCHING_IN_FIRST_STRING;
+	  
+	  DEBUG_PRINT1 ("backtracking.\n");
+	  
+	  if (!FAIL_STACK_EMPTY ())
+	    { /* More failure points to try.  */
+	      
+	      /* If exceeds best match so far, save it.  */
+	      if (!best_regs_set || best_match_p)
+		{
+		  best_regs_set = true;
+		  match_end = d;
+		  
+		  DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
+		  
+		  for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
+		    {
+		      best_regstart[mcnt] = regstart[mcnt];
+		      best_regend[mcnt] = regend[mcnt];
+		    }
+		}
+	      goto fail;
+	    }
+	  
+	  /* If no failure points, don't restore garbage.  And if
+	     last match is real best match, don't restore second
+	     best one. */
+	  else if (best_regs_set && !best_match_p)
+	    {
+	    restore_best_regs:
+	      /* Restore best match.  It may happen that `dend ==
+		 end_match_1' while the restored d is in string2.
+		 For example, the pattern `x.*y.*z' against the
+		 strings `x-' and `y-z-', if the two strings are
+		 not consecutive in memory.  */
+	      DEBUG_PRINT1 ("Restoring best registers.\n");
+	      
+	      d = match_end;
+#ifdef SINGLE_STRING
+	      dend = end_match_2;
+#else
+	      dend = ((d >= string1 && d <= end1)
+		      ? end_match_1 : end_match_2);
+#endif
+
+	      for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
+		{
+		  regstart[mcnt] = best_regstart[mcnt];
+		  regend[mcnt] = best_regend[mcnt];
+		}
+	    }
+	} /* d != end_match_2 */
+
+    succeed_label:
+      DEBUG_PRINT1 ("Accepting match.\n");
+      /* If caller wants register contents data back, do it.  */
+      if (regs && !bufp->no_sub)
+	{
+	  /* Have the register data arrays been allocated?  */
+	  if (bufp->regs_allocated == REGS_UNALLOCATED)
+	    { /* No.  So allocate them with malloc.  We need one
+		 extra element beyond `num_regs' for the `-1' marker
+		 GNU code uses.  */
+	      regs->num_regs = MAX (RE_NREGS, num_regs + 1);
+	      regs->start = TALLOC (regs->num_regs, regoff_t);
+	      regs->end = TALLOC (regs->num_regs, regoff_t);
+	      if (regs->start == NULL || regs->end == NULL)
+		{
+		  FREE_VARIABLES ();
+		  return -2;
+		}
+	      bufp->regs_allocated = REGS_REALLOCATE;
+	    }
+	  else if (bufp->regs_allocated == REGS_REALLOCATE)
+	    { /* Yes.  If we need more elements than were already
+		 allocated, reallocate them.  If we need fewer, just
+		 leave it alone.  */
+	      if (regs->num_regs < num_regs + 1)
+		{
+		  regs->num_regs = num_regs + 1;
+		  RETALLOC (regs->start, regs->num_regs, regoff_t);
+		  RETALLOC (regs->end, regs->num_regs, regoff_t);
+		  if (regs->start == NULL || regs->end == NULL)
+		    {
+		      FREE_VARIABLES ();
+		      return -2;
+		    }
+		}
+	    }
+	  else
+	    {
+	      /* These braces fend off a "empty body in an else-statement"
+		 warning under GCC when assert expands to nothing.  */
+	      assert (bufp->regs_allocated == REGS_FIXED);
+	    }
+
+	  /* Convert the pointer data in `regstart' and `regend' to
+	     indices.  Register zero has to be set differently,
+	     since we haven't kept track of any info for it.  */
+	  if (regs->num_regs > 0)
+	    {
+	      regs->start[0] = pos;
+#ifdef MBS_SUPPORT
+	      if (MATCHING_IN_FIRST_STRING)
+		regs->end[0] = mbs_offset1 != NULL ?
+		               mbs_offset1[d-string1] : 0;
+	      else
+		regs->end[0] = csize1 + (mbs_offset2 != NULL ?
+					 mbs_offset2[d-string2] : 0);
+#else
+	      regs->end[0] = (MATCHING_IN_FIRST_STRING
+			      ? ((regoff_t) (d - string1))
+			      : ((regoff_t) (d - string2 + size1)));
+#endif /* MBS_SUPPORT */
+	    }
+
+	  /* Go through the first `min (num_regs, regs->num_regs)'
+	     registers, since that is all we initialized.  */
+	  for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
+	       mcnt++)
+	    {
+	      if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
+		regs->start[mcnt] = regs->end[mcnt] = -1;
+	      else
+		{
+		  regs->start[mcnt]
+		    = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
+		  regs->end[mcnt]
+		    = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
+		}
+	    }
+	  
+	  /* If the regs structure we return has more elements than
+	     were in the pattern, set the extra elements to -1.  If
+	     we (re)allocated the registers, this is the case,
+	     because we always allocate enough to have at least one
+	     -1 at the end.  */
+	  for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
+	    regs->start[mcnt] = regs->end[mcnt] = -1;
+	} /* regs && !bufp->no_sub */
+      
+      DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
+		    nfailure_points_pushed, nfailure_points_popped,
+		    nfailure_points_pushed - nfailure_points_popped);
+      DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
+      
+#ifdef MBS_SUPPORT
+      if (MATCHING_IN_FIRST_STRING)
+	mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
+      else
+	mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + csize1;
+      mcnt -= pos;
+#else
+      mcnt = d - pos - (MATCHING_IN_FIRST_STRING
+			? string1
+			: string2 - size1);
+#endif /* MBS_SUPPORT */
+
+      DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
+      
+      FREE_VARIABLES ();
+      return mcnt;
 
     /* We goto here if a matching operation fails. */
     fail:
@@ -7218,9 +7404,12 @@
                 }
             }
 
+#ifndef SINGLE_STRING
           if (d >= string1 && d <= end1)
             dend = end_match_1;
+#endif
         }
+
       else
         break;   /* Matching at this starting point really fails.  */
     } /* for (;;) */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2001-05-08 10:06 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-05-08 10:06 PATCH: Regex performance improvements Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).