From mboxrd@z Thu Jan  1 00:00:00 1970
Subject: Re: PR29472: debuginfod metadata query
From: Mark Wielaard
To: "Frank Ch. Eigler", elfutils-devel@sourceware.org
Date: Fri, 07 Jul 2023 15:59:58 +0200

Hi Frank,

And finally the debuginfod server code itself.

On Wed, 2023-04-12 at 16:31 -0400, Frank Ch. Eigler via Elfutils-devel wrote:
> +	* debuginfod.cxx (DEBUGINFOD_SQLITE_DDL): Add an index or two.
> +	(metadata_maxtime_s, parse_opt): New parameter for load control.
> +	(add_client_federation_headers): New refactored function.
> +	(handle_metadata): New function.
> +	(handler_cb): Call it for /metadata URL. Trace it.
> +	(groom): Tweak sqlite_ps object lifetimes.
> [...]
> diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
> index 5ef6cc32189b..000820fec5ea 100644
> --- a/debuginfod/debuginfod.cxx
> +++ b/debuginfod/debuginfod.cxx
> @@ -1,5 +1,5 @@
>  /* Debuginfo-over-http server.
> -   Copyright (C) 2019-2021 Red Hat, Inc.
> +   Copyright (C) 2019-2023 Red Hat, Inc.
>     Copyright (C) 2021, 2022 Mark J. Wielaard
>     This file is part of elfutils.
>
> @@ -68,6 +68,7 @@ extern "C" {
>  #include
>  #include
>  #include
> +#include
>
>
>  /* If fts.h is included before config.h, its indirect inclusions may not
> @@ -127,6 +128,9 @@ using namespace std;
>  #define tid() pthread_self()
>  #endif
>
> +#ifdef HAVE_JSON_C
> +  #include
> +#endif
>
>  inline bool
>  string_endswith(const string& haystack, const string& needle)
> @@ -185,7 +189,7 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
>    "        foreign key (buildid) references " BUILDIDS "_buildids(id) on update cascade on delete cascade,\n"
>    "        primary key (buildid, file, mtime)\n"
>    "        ) " WITHOUT_ROWID ";\n"
> -  // Index for faster delete by file identifier
> +  // Index for faster delete by file identifier and metadata searches
>    "create index if not exists " BUILDIDS "_f_de_idx on " BUILDIDS "_f_de (file, mtime);\n"
>    "create table if not exists " BUILDIDS "_f_s (\n"
>    "        buildid integer not null,\n"
> @@ -211,6 +215,8 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
>    "        ) " WITHOUT_ROWID ";\n"
>    // Index for faster delete by archive file identifier
>    "create index if not exists " BUILDIDS "_r_de_idx on " BUILDIDS "_r_de (file, mtime);\n"
> +  // Index for metadata searches
> +  "create index if not exists " BUILDIDS "_r_de_idx2 on " BUILDIDS "_r_de (content);\n"
>    "create table if not exists " BUILDIDS "_r_sref (\n" // outgoing dwarf sourcefile references from rpm
>    "        buildid integer not null,\n"
>    "        artifactsrc integer not null,\n"

OK, an extra index on content in the _r_de table, which is an index
into the _files table. In general I find it hard to keep the whole sql
structure in my head. Is this described somewhere apart from just
reading the whole DEBUGINFOD_SQLITE_DDL?

> @@ -398,6 +404,9 @@ static const struct argp_option options[] =
>    { "passive", ARGP_KEY_PASSIVE, NULL, 0, "Do not scan or groom, read-only database.", 0 },
>  #define ARGP_KEY_DISABLE_SOURCE_SCAN 0x1009
>    { "disable-source-scan", ARGP_KEY_DISABLE_SOURCE_SCAN, NULL, 0, "Do not scan dwarf source info.", 0 },
> +#define ARGP_KEY_METADATA_MAXTIME 0x100A
> +  { "metadata-maxtime", ARGP_KEY_METADATA_MAXTIME, "SECONDS", 0,
> +    "Number of seconds to limit metadata query run time, 0=unlimited.", 0 },
>    { NULL, 0, NULL, 0, NULL, 0 },
>  };

Ack. New --metadata-maxtime argument.

> @@ -452,6 +461,8 @@ static unsigned forwarded_ttl_limit = 8;
>  static bool scan_source_info = true;
>  static string tmpdir;
>  static bool passive_p = false;
> +static unsigned metadata_maxtime_s = 5;
> +

OK. Defaults to 5 seconds.

>  static void set_metric(const string& key, double value);
>  // static void inc_metric(const string& key);
> @@ -653,6 +664,9 @@ parse_opt (int key, char *arg,
>      case ARGP_KEY_DISABLE_SOURCE_SCAN:
>        scan_source_info = false;
>        break;
> +    case ARGP_KEY_METADATA_MAXTIME:
> +      metadata_maxtime_s = (unsigned) atoi(arg);
> +      break;
>      // case 'h': argp_state_help (state, stderr, ARGP_HELP_LONG|ARGP_HELP_EXIT_OK);
>      default: return ARGP_ERR_UNKNOWN;
>      }

OK. Can be set by user.
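One small thing: atoi silently turns a malformed SECONDS argument into
0, which here means unlimited. An untested sketch of a stricter parse
(parse_seconds is my name, nothing from the patch):

#include <cerrno>
#include <climits>
#include <cstdlib>

// Reject malformed or out-of-range values instead of letting atoi
// silently map them to 0 (= unlimited).
static bool
parse_seconds (const char *arg, unsigned &result)
{
  char *end = NULL;
  errno = 0;
  unsigned long v = strtoul (arg, &end, 10);
  if (errno != 0 || end == arg || *end != '\0' || v > UINT_MAX)
    return false;
  result = (unsigned) v;
  return true;
}

Then the ARGP_KEY_METADATA_MAXTIME case could argp_error when it
returns false. Not a blocker, just a thought.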
> @@ -2040,6 +2054,58 @@ handle_buildid_r_match (bool internal_req_p,
>    return r;
>  }
>
> +void
> +add_client_federation_headers(debuginfod_client *client, MHD_Connection* conn){
> +  // Transcribe incoming User-Agent:
> +  string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
> +  string ua_complete = string("User-Agent: ") + ua;
> +  debuginfod_add_http_header (client, ua_complete.c_str());
> +
> +  // Compute larger XFF:, for avoiding info loss during
> +  // federation, and for future cyclicity detection.
> +  string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
> +  if (xff != "")
> +    xff += string(", "); // comma separated list
> +
> +  unsigned int xff_count = 0;
> +  for (auto&& i : xff){
> +    if (i == ',') xff_count++;
> +  }
> +
> +  // if X-Forwarded-For: exceeds N hops,
> +  // do not delegate a local lookup miss to upstream debuginfods.
> +  if (xff_count >= forwarded_ttl_limit)
> +    throw reportable_exception(MHD_HTTP_NOT_FOUND, "not found, --forwared-ttl-limit reached \
> +and will not query the upstream servers");
> +
> +  // Compute the client's numeric IP address only - so can't merge with conninfo()
> +  const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
> +                                                               MHD_CONNECTION_INFO_CLIENT_ADDRESS);
> +  struct sockaddr *so = u ? u->client_addr : 0;
> +  char hostname[256] = ""; // RFC1035
> +  if (so && so->sa_family == AF_INET) {
> +    (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
> +                        NI_NUMERICHOST);
> +  } else if (so && so->sa_family == AF_INET6) {
> +    struct sockaddr_in6* addr6 = (struct sockaddr_in6*) so;
> +    if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
> +      struct sockaddr_in addr4;
> +      memset (&addr4, 0, sizeof(addr4));
> +      addr4.sin_family = AF_INET;
> +      addr4.sin_port = addr6->sin6_port;
> +      memcpy (&addr4.sin_addr.s_addr, addr6->sin6_addr.s6_addr+12, sizeof(addr4.sin_addr.s_addr));
> +      (void) getnameinfo ((struct sockaddr*) &addr4, sizeof (addr4),
> +                          hostname, sizeof (hostname), NULL, 0,
> +                          NI_NUMERICHOST);
> +    } else {
> +      (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
> +                          NI_NUMERICHOST);
> +    }
> +  }
> +
> +  string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
> +  debuginfod_add_http_header (client, xff_complete.c_str());
> +}

OK, extracted from handle_buildid.

>
>  static struct MHD_Response*
>  handle_buildid_match (bool internal_req_p,
> @@ -2273,57 +2339,7 @@ handle_buildid (MHD_Connection* conn,
>    debuginfod_set_progressfn (client, & debuginfod_find_progress);
>
>    if (conn)
> -    {
> -      // Transcribe incoming User-Agent:
> -      string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
> -      string ua_complete = string("User-Agent: ") + ua;
> -      debuginfod_add_http_header (client, ua_complete.c_str());
> -
> -      // Compute larger XFF:, for avoiding info loss during
> -      // federation, and for future cyclicity detection.
> -      string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
> -      if (xff != "")
> -        xff += string(", "); // comma separated list
> -
> -      unsigned int xff_count = 0;
> -      for (auto&& i : xff){
> -        if (i == ',') xff_count++;
> -      }
> -
> -      // if X-Forwarded-For: exceeds N hops,
> -      // do not delegate a local lookup miss to upstream debuginfods.
> -      if (xff_count >= forwarded_ttl_limit)
> -        throw reportable_exception(MHD_HTTP_NOT_FOUND, "not found, --forwared-ttl-limit reached \
> -and will not query the upstream servers");
> -
> -      // Compute the client's numeric IP address only - so can't merge with conninfo()
> -      const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
> -                                                                   MHD_CONNECTION_INFO_CLIENT_ADDRESS);
> -      struct sockaddr *so = u ? u->client_addr : 0;
> -      char hostname[256] = ""; // RFC1035
> -      if (so && so->sa_family == AF_INET) {
> -        (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
> -                            NI_NUMERICHOST);
> -      } else if (so && so->sa_family == AF_INET6) {
> -        struct sockaddr_in6* addr6 = (struct sockaddr_in6*) so;
> -        if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
> -          struct sockaddr_in addr4;
> -          memset (&addr4, 0, sizeof(addr4));
> -          addr4.sin_family = AF_INET;
> -          addr4.sin_port = addr6->sin6_port;
> -          memcpy (&addr4.sin_addr.s_addr, addr6->sin6_addr.s6_addr+12, sizeof(addr4.sin_addr.s_addr));
> -          (void) getnameinfo ((struct sockaddr*) &addr4, sizeof (addr4),
> -                              hostname, sizeof (hostname), NULL, 0,
> -                              NI_NUMERICHOST);
> -        } else {
> -          (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
> -                              NI_NUMERICHOST);
> -        }
> -      }
> -
> -      string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
> -      debuginfod_add_http_header (client, xff_complete.c_str());
> -    }
> +    add_client_federation_headers(client, conn);
>
>    if (artifacttype == "debuginfo")
>      fd = debuginfod_find_debuginfo (client,
> @@ -2535,6 +2551,176 @@ handle_metrics (off_t* size)
>    return r;
>  }

OK, moved into add_client_federation_headers.

> +
> +#ifdef HAVE_JSON_C
> +static struct MHD_Response*
> +handle_metadata (MHD_Connection* conn,
> +                 string key, string value, off_t* size)
> +{
> +  MHD_Response* r;
> +  sqlite3 *thisdb = dbq;
> +
> +  // Query locally for matching e, d files
> +  string op;
> +  if (key == "glob")
> +    op = "glob";
> +  else if (key == "file")
> +    op = "=";
> +  else
> +    throw reportable_exception("/metadata webapi error, unsupported key");
> +
> +  string sql = string(
> +      // explicit query r_de and f_de once here, rather than the query_d and query_e
> +      // separately, because they scan the same tables, so we'd double the work
> +      "select d1.executable_p, d1.debuginfo_p, 0 as source_p, b1.hex, f1.name as file, a1.name as archive "
> +      "from " BUILDIDS "_r_de d1, " BUILDIDS "_files f1, " BUILDIDS "_buildids b1, " BUILDIDS "_files a1 "
> +      "where f1.id = d1.content and a1.id = d1.file and d1.buildid = b1.id and f1.name " + op + " ? "
> +      "union all \n"
> +      "select d2.executable_p, d2.debuginfo_p, 0, b2.hex, f2.name, NULL "
> +      "from " BUILDIDS "_f_de d2, " BUILDIDS "_files f2, " BUILDIDS "_buildids b2 "
> +      "where f2.id = d2.file and d2.buildid = b2.id and f2.name " + op + " ? ");
> +  // NB: we could query source file names too, thusly:
> +  //
> +  //    select * from " BUILDIDS "_buildids b, " BUILDIDS "_files f1, " BUILDIDS "_r_sref sr
> +  //    where b.id = sr.buildid and f1.id = sr.artifactsrc and f1.name " + op + "?"
> +  //    UNION ALL something with BUILDIDS "_f_s"
> +  //
> +  // But the first part of this query cannot run fast without the same index temp-created
> +  // during "maxigroom":
> +  //    create index " BUILDIDS "_r_sref_arc on " BUILDIDS "_r_sref(artifactsrc);
> +  // and unfortunately this index is HUGE.
> +  // It's similar to the size of the _r_sref table, which is already the
> +  // largest part of a debuginfod index. Adding that index would nearly
> +  // double the .sqlite db size.

Magic sql query that looks ok. Thanks for the extra comments.

> +  sqlite_ps *pp = new sqlite_ps (thisdb, "mhd-query-meta-glob", sql);
> +  pp->reset();
> +  pp->bind(1, value);
> +  pp->bind(2, value);
> +  // pp->bind(3, value); // "source" query clause disabled
> +  unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return

OK.

> +  json_object *metadata = json_object_new_array();
> +  if (!metadata)
> +    throw libc_exception(ENOMEM, "json allocation");

If there is an exception below won't this leak?

> +
> +  // consume all the rows
> +  struct timespec ts_start;
> +  clock_gettime (CLOCK_MONOTONIC, &ts_start);
> +
> +  int rc;
> +  while (SQLITE_DONE != (rc = pp->step()))
> +    {
> +      // break out of loop if we have searched too long
> +      struct timespec ts_end;
> +      clock_gettime (CLOCK_MONOTONIC, &ts_end);
> +      double deltas = (ts_end.tv_sec - ts_start.tv_sec) + (ts_end.tv_nsec - ts_start.tv_nsec)/1.e9;
> +      if (metadata_maxtime_s > 0 && deltas > metadata_maxtime_s)
> +        break; // NB: no particular signal is given to the client about incompleteness

Why not? They get a partial result.

> +      if (rc != SQLITE_ROW) throw sqlite_exception(rc, "step");
> +
> +      int m_executable_p = sqlite3_column_int (*pp, 0);
> +      int m_debuginfo_p  = sqlite3_column_int (*pp, 1);
> +      int m_source_p     = sqlite3_column_int (*pp, 2);
> +      string m_buildid   = (const char*) sqlite3_column_text (*pp, 3) ?: ""; // should always be non-null
> +      string m_file      = (const char*) sqlite3_column_text (*pp, 4) ?: "";
> +      string m_archive   = (const char*) sqlite3_column_text (*pp, 5) ?: "";
> +
> +      // Confirm that m_file matches in the fnmatch(FNM_PATHNAME)
> +      // sense, since sqlite's GLOB operator is a looser filter.
> +      if (key == "glob" && fnmatch(value.c_str(), m_file.c_str(), FNM_PATHNAME) != 0)
> +        continue;

If I understand correctly when FNM_PATHNAME is set then slashes '/'
need to be explicitly matched. Is that deliberate? (See the small
fnmatch example below.)

> +      auto add_metadata = [metadata, m_buildid, m_file, m_archive](const string& type) {
> +        json_object* entry = json_object_new_object();
> +        if (NULL == entry) throw libc_exception (ENOMEM, "cannot allocate json");
> +        defer_dtor entry_d(entry, json_object_put);
> +
> +        auto add_entry_metadata = [entry](const char* k, string v) {
> +          json_object* s;
> +          if(v != "") {
> +            s = json_object_new_string(v.c_str());
> +            if (NULL == s) throw libc_exception (ENOMEM, "cannot allocate json");
> +            json_object_object_add(entry, k, s);
> +          }
> +        };
> +
> +        add_entry_metadata("type", type.c_str());
> +        add_entry_metadata("buildid", m_buildid);
> +        add_entry_metadata("file", m_file);
> +        if (m_archive != "") add_entry_metadata("archive", m_archive);
> +        if (verbose > 3)
> +          obatched(clog) << "metadata found local "
> +                         << json_object_to_json_string_ext(entry,
> +                                                           JSON_C_TO_STRING_PRETTY)
> +                         << endl;
> +
> +        // Increase ref count to switch its ownership
> +        json_object_array_add(metadata, json_object_get(entry));
> +      };
> +
> +      if (m_executable_p) add_metadata("executable");
> +      if (m_debuginfo_p)  add_metadata("debuginfo");
> +      if (m_source_p)     add_metadata("source");

Would it make sense to check the size of the created output here?
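To make the FNM_PATHNAME question concrete, here is a tiny standalone
example (mine, not from the patch) of where fnmatch and sqlite GLOB
disagree:

#include <fnmatch.h>
#include <cstdio>

int
main ()
{
  // sqlite GLOB lets '*' cross '/' boundaries, so "/usr/*/ls" matches
  // both paths below.  With FNM_PATHNAME, '*' cannot match a '/', so
  // only the first call matches (prints 0); the second prints
  // FNM_NOMATCH, i.e. non-zero.
  printf ("%d\n", fnmatch ("/usr/*/ls", "/usr/bin/ls", FNM_PATHNAME));
  printf ("%d\n", fnmatch ("/usr/*/ls", "/usr/local/bin/ls", FNM_PATHNAME));
  return 0;
}

So a user globbing for "/usr/*" would not see files in subdirectories.
If that is the intent it probably deserves a note in the docs.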
>
> +    }
> +  pp->reset();
> +
> +  unsigned num_local_results = json_object_array_length(metadata);
> +
> +  // Query upstream as well
> +  debuginfod_client *client = debuginfod_pool_begin();
> +  if (metadata && client != NULL)
> +    {
> +      add_client_federation_headers(client, conn);
> +
> +      int upstream_metadata_fd;
> +      upstream_metadata_fd = debuginfod_find_metadata(client, key.c_str(), value.c_str(), NULL);
> +      if (upstream_metadata_fd >= 0) {
> +        json_object *upstream_metadata_json = json_object_from_fd(upstream_metadata_fd);
> +        if (NULL != upstream_metadata_json)
> +          {
> +            for (int i = 0, n = json_object_array_length(upstream_metadata_json); i < n; i++) {
> +              json_object *entry = json_object_array_get_idx(upstream_metadata_json, i);
> +              if (verbose > 3)
> +                obatched(clog) << "metadata found remote "
> +                               << json_object_to_json_string_ext(entry,
> +                                                                 JSON_C_TO_STRING_PRETTY)
> +                               << endl;
> +
> +              json_object_get(entry); // increment reference count
> +              json_object_array_add(metadata, entry);
> +            }

What if the upstream json object isn't an array (or invalid)?

> +            json_object_put(upstream_metadata_json);
> +          }
> +        close(upstream_metadata_fd);
> +      }
> +      debuginfod_pool_end (client);
> +    }
> +
> +  unsigned num_total_results = json_object_array_length(metadata);
> +
> +  if (verbose > 2)
> +    obatched(clog) << "metadata found local=" << num_local_results
> +                   << " remote=" << (num_total_results-num_local_results)
> +                   << " total=" << num_total_results
> +                   << endl;
> +
> +  const char* metadata_str = (metadata != NULL) ?
> +    json_object_to_json_string(metadata) : "[ ]" ;
> +  if (! metadata_str)
> +    throw libc_exception (ENOMEM, "cannot allocate json");
> +  r = MHD_create_response_from_buffer (strlen(metadata_str),
> +                                       (void*) metadata_str,
> +                                       MHD_RESPMEM_MUST_COPY);
> +  *size = strlen(metadata_str);
> +  json_object_put(metadata);

I am not sure I totally follow the reference counting. There are a
couple of json_object_gets before adding things to an array to
increase their reference count (I assume they start at one). I assume
the json_object_put will also put/decrease the count of all json
objects contained (recursively)? Are the individual members released
then? It seems their reference count is one too high. Or am I missing
something? (The sketch below shows the model I am assuming.)
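For reference, the ownership model I am assuming json-c implements, as
a standalone sketch (my reading of the json-c docs, so please correct
me if it behaves differently):

#include <json-c/json.h>

int
main ()
{
  json_object *metadata = json_object_new_array ();  // refcount(metadata) == 1
  json_object *entry = json_object_new_object ();    // refcount(entry) == 1

  // json_object_array_add doesn't bump the count, it *steals* a
  // reference.  The explicit json_object_get compensates, so afterwards
  // one reference belongs to the array and one to us (in the patch: to
  // the defer_dtor).
  json_object_array_add (metadata, json_object_get (entry)); // refcount(entry) == 2
  json_object_put (entry);   // drop our reference; refcount(entry) == 1

  // put on the array recursively puts all elements, so both the array
  // and entry are freed here.
  json_object_put (metadata);
  return 0;
}

If that model is right, the counting in the patch does balance out
after all, but an explicit comment in the code would help the next
reader.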
> +  if (r)
> +    add_mhd_response_header(r, "Content-Type", "application/json");
> +  return r;
> +}
> +#endif
> +
> +
>  static struct MHD_Response*
>  handle_root (off_t* size)
>  {
> @@ -2601,6 +2787,7 @@ handler_cb (void * /*cls*/,
>    clock_gettime (CLOCK_MONOTONIC, &ts_start);
>    double afteryou = 0.0;
>    string artifacttype, suffix;
> +  string urlargs; // for logging
>
>    try
>      {
> @@ -2669,6 +2856,21 @@ handler_cb (void * /*cls*/,
>            inc_metric("http_requests_total", "type", artifacttype);
>            r = handle_metrics(& http_size);
>          }
> +#ifdef HAVE_JSON_C
> +      else if (url1 == "/metadata")
> +        {
> +          tmp_inc_metric m ("thread_busy", "role", "http-metadata");
> +          const char* key = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "key");
> +          const char* value = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "value");
> +          if (NULL == value || NULL == key)
> +            throw reportable_exception("/metadata webapi error, need key and value");
> +
> +          urlargs = string("?key=") + string(key) + string("&value=") + string(value); // apprx., for logging
> +          artifacttype = "metadata";
> +          inc_metric("http_requests_total", "type", artifacttype);
> +          r = handle_metadata(connection, key, value, &http_size);
> +        }
> +#endif

Are the key and value url decoded?

>        else if (url1 == "/")
>          {
>            artifacttype = "/";
> @@ -2705,7 +2907,7 @@ handler_cb (void * /*cls*/,
>    // afteryou: delay waiting for other client's identical query to complete
>    // deltas: total latency, including afteryou waiting
>    obatched(clog) << conninfo(connection)
> -                 << ' ' << method << ' ' << url
> +                 << ' ' << method << ' ' << url << urlargs
>                   << ' ' << http_code << ' ' << http_size
>                   << ' ' << (int)(afteryou*1000) << '+' << (int)((deltas-afteryou)*1000) << "ms"
>                   << endl;

OK, urlargs not set when there is no JSON_C.

> @@ -3956,12 +4158,13 @@ void groom()
>    if (interrupted) return;
>
>    // NB: "vacuum" is too heavy for even daily runs: it rewrites the entire db, so is done as maxigroom -G
> -  sqlite_ps g1 (db, "incremental vacuum", "pragma incremental_vacuum");
> -  g1.reset().step_ok_done();
> -  sqlite_ps g2 (db, "optimize", "pragma optimize");
> -  g2.reset().step_ok_done();
> -  sqlite_ps g3 (db, "wal checkpoint", "pragma wal_checkpoint=truncate");
> -  g3.reset().step_ok_done();
> +  { sqlite_ps g (db, "incremental vacuum", "pragma incremental_vacuum"); g.reset().step_ok_done(); }
> +  // https://www.sqlite.org/lang_analyze.html#approx
> +  { sqlite_ps g (db, "analyze setup", "pragma analysis_limit = 1000;\n"); g.reset().step_ok_done(); }
> +  { sqlite_ps g (db, "analyze", "analyze"); g.reset().step_ok_done(); }
> +  { sqlite_ps g (db, "analyze reload", "analyze sqlite_schema"); g.reset().step_ok_done(); }
> +  { sqlite_ps g (db, "optimize", "pragma optimize"); g.reset().step_ok_done(); }
> +  { sqlite_ps g (db, "wal checkpoint", "pragma wal_checkpoint=truncate"); g.reset().step_ok_done(); }

Reads nicer formatted this way. I don't fully understand why you run
both analyze and then optimize. Doesn't optimize call analyze itself?
(See the sketch at the end of this mail.)

>    database_stats_report();
>
> @@ -4333,6 +4536,8 @@ main (int argc, char *argv[])
>    if (maxigroom)
>      {
>        obatched(clog) << "maxigrooming database, please wait." << endl;
> +      // NB: this index alone can nearly double the database size!
> +      // NB: this index would be necessary to run source-file metadata searches fast
>        extra_ddl.push_back("create index if not exists " BUILDIDS "_r_sref_arc on " BUILDIDS "_r_sref(artifactsrc);");
>        extra_ddl.push_back("delete from " BUILDIDS "_r_sdef where not exists (select 1 from " BUILDIDS "_r_sref b where " BUILDIDS "_r_sdef.content = b.artifactsrc);");
>        extra_ddl.push_back("drop index if exists " BUILDIDS "_r_sref_arc;");
>
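And the sketch promised above: my reading of the "approximate ANALYZE"
recipe from the sqlite page the patch links to, as a plain sqlite3
snippet (not a patch suggestion; 'db' is just an open handle):

#include <sqlite3.h>

static void
groom_stats (sqlite3 *db)
{
  // Cap the number of rows ANALYZE examines per index, making it
  // cheap enough for routine grooming runs.
  sqlite3_exec (db, "pragma analysis_limit = 1000;", NULL, NULL, NULL);
  // "pragma optimize" runs ANALYZE itself, but only on tables whose
  // statistics look out of date -- which is why I'd expect the
  // explicit "analyze" in the patch to be redundant.
  sqlite3_exec (db, "pragma optimize;", NULL, NULL, NULL);
}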