public inbox for glibc-bugs-regex@sourceware.org
help / color / mirror / Atom feed
* [Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot
@ 2004-11-12 16:39 jakub at redhat dot com
  2004-11-19 10:28 ` [Bug regex/544] " jakub at redhat dot com
  0 siblings, 1 reply; 2+ messages in thread
From: jakub at redhat dot com @ 2004-11-12 16:39 UTC (permalink / raw)
  To: glibc-bugs-regex

#include <fcntl.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

/* Time regexec over the contents of ../ChangeLog.14 for two patterns that
   differ only in that the second wraps every ".?" in deeply nested,
   otherwise unneeded parentheses.  Neither pattern is expected to match;
   the elapsed time of each regexec call is printed.

   Returns 0 on success, 1 if the input file cannot be opened, examined,
   buffered or read, or if regexec returns anything other than REG_NOMATCH,
   and the regcomp error code if a pattern fails to compile.  */
static int
do_test (void)
{
  static const char *pat[] = {
    ".?.?.?.?.?.?.?abcde",
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))abcde" };

  int fd = open ("../ChangeLog.14", O_RDONLY);
  if (fd < 0)
    {
      printf ("Couldn't open ChangeLog.14: %m\n");
      return 1;
    }

  /* On every failure path below the message is printed before close () so
     that %m still sees the errno of the call that failed.  */
  struct stat64 st;
  if (fstat64 (fd, &st) < 0)
    {
      printf ("Couldn't fstat ChangeLog.14: %m\n");
      close (fd);
      return 1;
    }

  /* One extra byte for the terminating NUL that regexec needs.  */
  char *buf = malloc (st.st_size + 1);
  if (buf == NULL)
    {
      printf ("Couldn't allocate buffer: %m\n");
      close (fd);
      return 1;
    }

  if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
    {
      puts ("Couldn't read ChangeLog.14");
      free (buf);
      close (fd);
      return 1;
    }

  close (fd);
  buf[st.st_size] = '\0';

  setlocale (LC_ALL, "de_DE.UTF-8");

  int result = 0;
  for (size_t i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
    {
      printf ("pattern %s", pat[i]);

      regex_t rbuf;
      int err = regcomp (&rbuf, pat[i], REG_EXTENDED | REG_NOSUB);
      if (err != 0)
        {
          putchar ('\n');
          char errstr[300];
          regerror (err, &rbuf, errstr, sizeof (errstr));
          puts (errstr);
          result = err;
          break;
        }

      struct timeval start, stop;
      gettimeofday (&start, NULL);

      err = regexec (&rbuf, buf, 0, NULL, 0);

      /* Take the stop time right away so that only regexec is measured.  */
      gettimeofday (&stop, NULL);

      if (err != REG_NOMATCH)
        {
          puts ("\nregexec unexpectedly matched");
          regfree (&rbuf);
          result = 1;
          break;
        }

      /* Compute stop - start in place, borrowing one second when the
         microsecond field would otherwise go negative.  */
      stop.tv_sec -= start.tv_sec;
      if (stop.tv_usec < start.tv_usec)
        {
          stop.tv_sec--;
          stop.tv_usec += 1000000 - start.tv_usec;
        }
      else
        stop.tv_usec -= start.tv_usec;
      printf (": %ld.%06lds\n", (long) stop.tv_sec, (long) stop.tv_usec);

      regfree (&rbuf);
    }

  free (buf);
  return result;
}

/* Configuration for the shared test driver included below.
   NOTE(review): TIMEOUT is presumably a limit in seconds enforced by
   test-skeleton.c -- confirm against that file.  */
#define TIMEOUT 10
/* Expression that invokes this file's test; presumably evaluated by the
   skeleton's main -- confirm against test-skeleton.c.  */
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"

There is no reason why the second regexec should be any slower than the first
one, yet on my box the second regexec is about 6 times slower than the first one.

I'll look into what can be done.  I hope regcomp can keep those nodes from being
seen by regexec if they aren't needed for backreferences and REG_NOSUB is in
effect, or if they are nested with no intervening tokens in between (e.g.
'((x))') and we can tell the final match computation that, say, match 0 is
identical to match 1.

-- 
           Summary: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec
                    down a lot
           Product: glibc
           Version: unspecified
            Status: NEW
          Severity: normal
          Priority: P2
         Component: regex
        AssignedTo: gotom at debian dot or dot jp
        ReportedBy: jakub at redhat dot com
                CC: glibc-bugs-regex at sources dot redhat dot com,glibc-
                    bugs at sources dot redhat dot com


http://sources.redhat.com/bugzilla/show_bug.cgi?id=544

------- You are receiving this mail because: -------
You are on the CC list for the bug, or are watching someone who is.


^ permalink raw reply	[flat|nested] 2+ messages in thread

* [Bug regex/544] Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot
  2004-11-12 16:39 [Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot jakub at redhat dot com
@ 2004-11-19 10:28 ` jakub at redhat dot com
  0 siblings, 0 replies; 2+ messages in thread
From: jakub at redhat dot com @ 2004-11-19 10:28 UTC (permalink / raw)
  To: glibc-bugs-regex


------- Additional Comments From jakub at redhat dot com  2004-11-19 10:28 -------
This should be fixed in CVS now.

-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
         Resolution|                            |FIXED


http://sources.redhat.com/bugzilla/show_bug.cgi?id=544

------- You are receiving this mail because: -------
You are on the CC list for the bug, or are watching someone who is.


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2004-11-19 10:28 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-11-12 16:39 [Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot jakub at redhat dot com
2004-11-19 10:28 ` [Bug regex/544] " jakub at redhat dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).