public inbox for elfutils@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] PR26775 interruptible/observable grooming
@ 2020-10-29 18:30 Frank Ch. Eigler
  2020-10-30 12:36 ` Mark Wielaard
  0 siblings, 1 reply; 3+ messages in thread
From: Frank Ch. Eigler @ 2020-10-29 18:30 UTC (permalink / raw)
  To: elfutils-devel

Hi -

Author: Frank Ch. Eigler <fche@redhat.com>
Date:   Thu Oct 29 14:25:18 2020 -0400

    PR26775: make grooming progress visible & interruptible
    
    On very large servers, it's desirable to be able to interrupt a rescan
    or groom cycle.  SIGUSR[12] now do that.  (Unfortunately, this is not
    practically testable in the testsuite, since these cycles are so fast
    on that small dataset.)  We also expose more internal progress count
    about the grooming pass, so the administrator can assess possible need
    to interrupt.
    
    Signed-off-by: Frank Ch. Eigler <fche@redhat.com>

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 9eb9c813dd23..5af4dc6ca08f 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,14 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* debuginfod.cxx (forced_*_count): Make these global.
+	(runq::clear): New function.
+	(thread_main_scanner): Check for pending SIGUSR2; interrupt.
+	(scan_source_paths): Check for pending SIGUSR2; interrupt.
+	(groom): Report prometheus stats before groom also.  Check for
+	pending SIGUSR1; interrupt.  Increment thread_work_total for
+	each file scanned, not the entire cycle.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26810
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 9da65d87d601..8448a501f356 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -388,7 +388,9 @@ static string db_path;
 static sqlite3 *db; // single connection, serialized across all our threads!
 static unsigned verbose;
 static volatile sig_atomic_t interrupted = 0;
+static volatile sig_atomic_t forced_rescan_count = 0;
 static volatile sig_atomic_t sigusr1 = 0;
+static volatile sig_atomic_t forced_groom_count = 0;
 static volatile sig_atomic_t sigusr2 = 0;
 static unsigned http_port = 8002;
 static unsigned rescan_s = 300;
@@ -607,6 +609,14 @@ class workq
     cv.notify_all();
   }
 
+  // clear the workqueue, when scanning is interrupted with USR2
+  void clear() {
+    unique_lock<mutex> lock(mtx);
+    q.clear();
+    set_metric("thread_work_pending","role","scan", q.size());
+    cv.notify_all(); // maybe wake up waiting idlers
+  }
+
   // block this scanner thread until there is work to do and no active
   bool wait_front (Payload& p)
   {
@@ -2741,6 +2751,12 @@ thread_main_scanner (void* arg)
         }
 
       inc_metric("thread_work_total", "role","scan");
+      
+      if (sigusr2 != forced_groom_count) // stop early if groom triggered 
+        {
+          scanq.clear();
+          break;
+        }
     }
 
   add_metric("thread_busy", "role", "scan", -1);
@@ -2784,6 +2800,9 @@ scan_source_paths()
   {
     if (interrupted) break;
 
+    if (sigusr2 != forced_groom_count) // stop early if groom triggered 
+      break;
+    
     fts_scanned ++;
 
     if (verbose > 2)
@@ -2842,7 +2861,6 @@ thread_main_fts_source_paths (void* arg)
 {
   (void) arg; // ignore; we operate on global data
 
-  sig_atomic_t forced_rescan_count = 0;
   set_metric("thread_tid", "role","traverse", tid());
   add_metric("thread_count", "role", "traverse", 1);
 
@@ -2923,6 +2941,8 @@ void groom()
   struct timeval tv_start, tv_end;
   gettimeofday (&tv_start, NULL);
 
+  database_stats_report();
+  
   // scan for files that have disappeared
   sqlite_ps files (db, "check old files", "select s.mtime, s.file, f.name from "
                        BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f "
@@ -2951,6 +2971,10 @@ void groom()
           files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
           files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
         }
+      
+      inc_metric("thread_work_total", "role", "groom");
+      if (sigusr1 != forced_rescan_count) // stop early if scan triggered
+        break;
     }
   files.reset();
 
@@ -2987,7 +3011,6 @@ void groom()
 static void*
 thread_main_groom (void* /*arg*/)
 {
-  sig_atomic_t forced_groom_count = 0;
   set_metric("thread_tid", "role", "groom", tid());
   add_metric("thread_count", "role", "groom", 1);
 
@@ -3016,7 +3039,6 @@ thread_main_groom (void* /*arg*/)
             set_metric("thread_busy", "role", "groom", 1);
             groom ();
             last_groom = time(NULL); // NB: now was before grooming
-            inc_metric("thread_work_total", "role", "groom");
             set_metric("thread_busy", "role", "groom", 0);
           }
         catch (const sqlite_exception& e)
diff --git a/doc/ChangeLog b/doc/ChangeLog
index bde52c1157b7..8c33f1749c7f 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* debuginfod.8: Document that SIGUSR1 interrupts the groom
+	cycle, and SIGUSR2 interrupts rescan.
+
 2020-10-25  Mark Wielaard  <mark@klomp.org>
 
 	* debuginfod_find_debuginfo.3 (ECONNREFUSED): Document that this
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8
index a645ceed5eaa..152e368ce5be 100644
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -155,7 +155,8 @@ before doing it again.  A rescan for unchanged files is fast (because
 the index also stores the file mtimes).  A time of zero is acceptable,
 and means that only one initial scan should performed.  The default
 rescan time is 300 seconds.  Receiving a SIGUSR1 signal triggers a new
-scan, independent of the rescan time (including if it was zero).
+scan, independent of the rescan time (including if it was zero),
+interrupting a groom pass (if any).
 
 .TP
 .B "\-g SECONDS" "\-\-groom\-time=SECONDS"
@@ -167,7 +168,8 @@ it can deindex obsolete files.  See also the \fIDATA MANAGEMENT\fP
 section.  The default groom time is 86400 seconds (1 day).  A time of
 zero is acceptable, and means that only one initial groom should be
 performed.  Receiving a SIGUSR2 signal triggers a new grooming pass,
-independent of the groom time (including if it was zero).
+independent of the groom time (including if it was zero), interrupting
+a rescan pass (if any)..
 
 .TP
 .B "\-G"
diff --git a/tests/ChangeLog b/tests/ChangeLog
index f57db304bfeb..471479f6921e 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* run-debuginfod-find.sh: Modify test for different
+	thread_work_total semantics for grooming.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26810
diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
index 52def36438bb..3af04c00bcd6 100755
--- a/tests/run-debuginfod-find.sh
+++ b/tests/run-debuginfod-find.sh
@@ -345,8 +345,9 @@ RPM_BUILDID=d44d42cbd7d915bc938c81333a21e355a6022fb7 # in rhel6/ subdir, for a l
 rm -r R/debuginfod-rpms/rhel6/*
 kill -USR2 $PID1  # groom cycle
 # Expect 3 rpms to be deleted by the groom
-# 1 groom already took place at/soon-after startup, so -USR2 makes 2
-wait_ready $PORT1 'thread_work_total{role="groom"}' 2
+# 1 groom cycle already took place at/soon-after startup, so -USR2 makes 2
+# ... times the # of files checked in each cycle
+wait_ready $PORT1 'thread_work_total{role="groom"}' 51
 wait_ready $PORT1 'groom{statistic="file d/e"}' 3
 
 rm -rf $DEBUGINFOD_CACHE_PATH # clean it from previous tests
@@ -363,7 +364,7 @@ testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable $BUILDID2
 
 # run a groom cycle to force server to drop its fdcache
 kill -USR2 $PID1  # groom cycle
-wait_ready $PORT1 'thread_work_total{role="groom"}' 3
+wait_ready $PORT1 'thread_work_total{role="groom"}' 98 # 3 complete cycles
 # move it around a couple of times to make it likely to hit a nonexistent entry during iteration
 mv R/debuginfod-rpms/rhel7 R/debuginfod-rpms/rhel7renamed
 kill -USR1 $PID1  # scan cycle


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] PR26775 interruptible/observable grooming
  2020-10-29 18:30 [PATCH] PR26775 interruptible/observable grooming Frank Ch. Eigler
@ 2020-10-30 12:36 ` Mark Wielaard
  2020-10-30 22:36   ` Frank Ch. Eigler
  0 siblings, 1 reply; 3+ messages in thread
From: Mark Wielaard @ 2020-10-30 12:36 UTC (permalink / raw)
  To: Frank Ch. Eigler, elfutils-devel

Hi Frank,

On Thu, 2020-10-29 at 14:30 -0400, Frank Ch. Eigler via Elfutils-devel
wrote:
> Author: Frank Ch. Eigler <fche@redhat.com>
> Date:   Thu Oct 29 14:25:18 2020 -0400
> 
>     PR26775: make grooming progress visible & interruptible
>     
>     On very large servers, it's desirable to be able to interrupt a rescan
>     or groom cycle.  SIGUSR[12] now do that.  (Unfortunately, this is not
>     practically testable in the testsuite, since these cycles are so fast
>     on that small dataset.)  We also expose more internal progress count
>     about the grooming pass, so the administrator can assess possible need
>     to interrupt.

Looks good to me. Agreed about not really being able to test this due
to timing.

Cheers,

Mark

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] PR26775 interruptible/observable grooming
  2020-10-30 12:36 ` Mark Wielaard
@ 2020-10-30 22:36   ` Frank Ch. Eigler
  0 siblings, 0 replies; 3+ messages in thread
From: Frank Ch. Eigler @ 2020-10-30 22:36 UTC (permalink / raw)
  To: Mark Wielaard; +Cc: elfutils-devel

Hi -

Pushed the following patch to clear up mysterious testsuite
inconsistencies from the previous patch.


Author: Frank Ch. Eigler <fche@redhat.com>
Date:   Fri Oct 30 18:26:04 2020 -0400

    PR26775: restore thread_work_groom metric to cycle count
    
    ... and add new metrics about progress of traversal and groom
    processes.  Correct one control flow abnormality that could
    prematurely end a scanner thread and might have accounted for
    the inconsistent test results from the previous patch.
    
    Signed-off-by: Frank Ch. Eigler <fche@redhat.com>

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 5af4dc6ca08f..ec791c68d460 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,14 @@
+2020-10-30  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775 cont'd.
+	(thread_main_scanner): Ensure control doesn't leave
+	infinite loop until program exit, even if SIGUSR2.
+	(scan_source_paths): Have traverser clean scanq on
+	SIGUSR2.  Emit additional traversed_total metrics.
+	(groom): Emit additional groomed_total metrics.
+	(thread_main_groom): Restore previous thread_work_total
+	metric.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26775
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 8448a501f356..1020a6405978 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -2711,7 +2711,8 @@ thread_main_scanner (void* arg)
       add_metric("thread_busy", "role", "scan", -1);
       bool gotone = scanq.wait_front(p);
       add_metric("thread_busy", "role", "scan", 1);
-      if (! gotone) continue; // or break
+
+      if (! gotone) continue; // go back to waiting
 
       try
         {
@@ -2750,13 +2751,9 @@ thread_main_scanner (void* arg)
           e.report(cerr);
         }
 
-      inc_metric("thread_work_total", "role","scan");
-      
-      if (sigusr2 != forced_groom_count) // stop early if groom triggered 
-        {
-          scanq.clear();
-          break;
-        }
+      // finished a scanning step -- not a "loop", because we just
+      // consume the traversal loop's work, whenever
+      inc_metric("thread_work_total","role","scan");
     }
 
   add_metric("thread_busy", "role", "scan", -1);
@@ -2801,7 +2798,10 @@ scan_source_paths()
     if (interrupted) break;
 
     if (sigusr2 != forced_groom_count) // stop early if groom triggered 
-      break;
+      {
+        scanq.clear(); // clear previously issued work for scanner threads
+        break;
+      }
     
     fts_scanned ++;
 
@@ -2825,6 +2825,10 @@ scan_source_paths()
         if (verbose > 3)
           obatched(clog) << "fts skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
         fts_regex ++;
+        if (!ri)
+          inc_metric("traversed_total","type","regex-skipped-I");
+        if (rx)
+          inc_metric("traversed_total","type","regex-skipped-X");
         continue;
       }
 
@@ -2832,6 +2836,7 @@ scan_source_paths()
       {
       case FTS_F:
         scanq.push_back (make_pair(rps, *f->fts_statp));
+        inc_metric("traversed_total","type","file");
         break;
 
       case FTS_ERR:
@@ -2841,11 +2846,16 @@ scan_source_paths()
           auto x = libc_exception(f->fts_errno, string("fts traversal ") + string(f->fts_path));
           x.report(cerr);
         }
+        inc_metric("traversed_total","type","error");
         break;
 
-      default:
-        ;
-        /* ignore */
+      case FTS_D: // ignore
+        inc_metric("traversed_total","type","directory");
+        break;
+        
+      default: // ignore
+        inc_metric("traversed_total","type","other");
+        break;
       }
   }
   gettimeofday (&tv_end, NULL);
@@ -2890,6 +2900,7 @@ thread_main_fts_source_paths (void* arg)
             set_metric("thread_busy", "role","traverse", 1);
             scan_source_paths();
             last_rescan = time(NULL); // NB: now was before scanning
+            // finished a traversal loop
             inc_metric("thread_work_total", "role","traverse");
             set_metric("thread_busy", "role","traverse", 0);
           }
@@ -2970,9 +2981,11 @@ void groom()
           files_del_f_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
           files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
           files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
+          inc_metric("groomed_total", "decision", "stale");
         }
-      
-      inc_metric("thread_work_total", "role", "groom");
+      else
+        inc_metric("groomed_total", "decision", "fresh");
+
       if (sigusr1 != forced_rescan_count) // stop early if scan triggered
         break;
     }
@@ -3039,6 +3052,8 @@ thread_main_groom (void* /*arg*/)
             set_metric("thread_busy", "role", "groom", 1);
             groom ();
             last_groom = time(NULL); // NB: now was before grooming
+            // finished a grooming loop
+            inc_metric("thread_work_total", "role", "groom");
             set_metric("thread_busy", "role", "groom", 0);
           }
         catch (const sqlite_exception& e)
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 012e305cf418..13abf89ac5dd 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-30  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* run-debuginfod-find.sh: Modify test for restored
+	thread_work_total semantics for grooming.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26775
diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
index 3af04c00bcd6..0d1fcf1fb829 100755
--- a/tests/run-debuginfod-find.sh
+++ b/tests/run-debuginfod-find.sh
@@ -346,8 +346,7 @@ rm -r R/debuginfod-rpms/rhel6/*
 kill -USR2 $PID1  # groom cycle
 # Expect 3 rpms to be deleted by the groom
 # 1 groom cycle already took place at/soon-after startup, so -USR2 makes 2
-# ... times the # of files checked in each cycle
-wait_ready $PORT1 'thread_work_total{role="groom"}' 51
+wait_ready $PORT1 'thread_work_total{role="groom"}' 2
 wait_ready $PORT1 'groom{statistic="file d/e"}' 3
 
 rm -rf $DEBUGINFOD_CACHE_PATH # clean it from previous tests
@@ -364,7 +363,7 @@ testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable $BUILDID2
 
 # run a groom cycle to force server to drop its fdcache
 kill -USR2 $PID1  # groom cycle
-wait_ready $PORT1 'thread_work_total{role="groom"}' 98 # 3 complete cycles
+wait_ready $PORT1 'thread_work_total{role="groom"}' 3
 # move it around a couple of times to make it likely to hit a nonexistent entry during iteration
 mv R/debuginfod-rpms/rhel7 R/debuginfod-rpms/rhel7renamed
 kill -USR1 $PID1  # scan cycle



- FChE


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-10-30 22:37 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-29 18:30 [PATCH] PR26775 interruptible/observable grooming Frank Ch. Eigler
2020-10-30 12:36 ` Mark Wielaard
2020-10-30 22:36   ` Frank Ch. Eigler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).