public inbox for elfutils@sourceware.org
 help / color / mirror / Atom feed
* patch rfc: debuginfod -Z (generalized archive) support
@ 2020-02-05 20:09 Frank Ch. Eigler
  2020-02-07  9:24 ` Mark Wielaard
  0 siblings, 1 reply; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-05 20:09 UTC (permalink / raw)
  To: elfutils-devel

Hi -

A little extension lets us process arch-linux archives.  Awaiting
for some small test .pkg's from the arch folks for the elfutils
testsuite.  However, hand-testing on severa larger files works!

commit b51ae89befeb81c8b51b15b7168c6e616255b486 (fche/pacman-Z)
Author: Frank Ch. Eigler <fche@redhat.com>
Date:   Wed Feb 5 15:04:18 2020 -0500

    debuginfod: generalized archive support
    
    Add a '-Z EXT=CMD' option to debuginfod, which lets it scan any given
    extension and run CMD on it to unwrap distro archives.  For example,
    for arch-linux pacman files, -Z '.tar.zst=zstdcat' lets debuginfod
    grok debug and source content in split-debuginfo files.

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 8c97fdcf7085..d812e6d71ff0 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,9 @@
+2020-02-05  Frank Ch. Eigler  <fche@redhat.com>
+
+	* debuginfod.cxx (argp options): Add -Z option.
+	(canonicalized_archive_entry_pathname): New function for
+	distro-agnostic file name matching/storage.
+
 2020-01-22  Frank Ch. Eigler  <fche@redhat.com>
 
 	* debuginfod.cxx (dwarf_extract_source_paths): Don't print
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 623dbc593c70..0de6bbaea0ee 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -333,9 +333,10 @@ ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
 static const struct argp_option options[] =
   {
    { NULL, 0, NULL, 0, "Scanners:", 1 },
-   { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning threads.", 0 },
-   { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning threads.", 0 },
-   { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning threads.", 0 },
+   { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning.", 0 },
+   { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning.", 0 },
+   { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning.", 0 },
+   { "scan-archive", 'Z', "EXT=CMD", 0, "Enable arbitrary archive scanning.", 0 },
    // "source-oci-imageregistry"  ...
 
    { NULL, 0, NULL, 0, "Options:", 2 },
@@ -428,6 +429,15 @@ parse_opt (int key, char *arg,
       scan_archives[".deb"]="dpkg-deb --fsys-tarfile";
       scan_archives[".ddeb"]="dpkg-deb --fsys-tarfile";
       break;
+    case 'Z':
+      {
+        char* extension = strchr(arg, '=');
+        if (extension)
+          scan_archives[string(arg, (extension-arg))]=string(extension+1);
+        else
+          argp_failure(state, 1, EINVAL, "bad EXT=CMD format");
+      }
+      break;
     case 'L':
       traverse_logical = true;
       break;
@@ -1068,6 +1078,25 @@ class libarchive_fdcache
 static libarchive_fdcache fdcache;
 
 
+// For security/portability reasons, many distro-package archives have
+// a "./" in front of path names; others have nothing, others have
+// "/".  Canonicalize them all to a single leading "/", with the
+// assumption that this matches the dwarf-derived file names too.
+string canonicalized_archive_entry_pathname(struct archive_entry *e)
+{
+  string fn = archive_entry_pathname(e);
+  if (fn.size() == 0)
+    return fn;
+  if (fn[0] == '/')
+    return fn;
+  if (fn[0] == '.')
+    return fn.substr(1);
+  else
+    return string("/")+fn;
+}
+
+
+
 static struct MHD_Response*
 handle_buildid_r_match (int64_t b_mtime,
                         const string& b_source0,
@@ -1162,8 +1191,8 @@ handle_buildid_r_match (int64_t b_mtime,
       if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
         continue;
 
-      string fn = archive_entry_pathname (e);
-      if (fn != string(".")+b_source1)
+      string fn = canonicalized_archive_entry_pathname (e);
+      if (fn != b_source1)
         continue;
 
       // extract this file to a temporary file
@@ -2055,9 +2084,7 @@ archive_classify (const string& rps, string& archive_extension,
           if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
             continue;
 
-          string fn = archive_entry_pathname (e);
-          if (fn.size() > 1 && fn[0] == '.')
-            fn = fn.substr(1); // trim off the leading '.'
+          string fn = canonicalized_archive_entry_pathname (e);
 
           if (verbose > 3)
             obatched(clog) << "libarchive checking " << fn << endl;
@@ -2764,7 +2791,7 @@ main (int argc, char *argv[])
              "unexpected argument: %s", argv[remaining]);
 
   if (scan_archives.size()==0 && !scan_files && source_paths.size()>0)
-    obatched(clog) << "warning: without -F -R -U, ignoring PATHs" << endl;
+    obatched(clog) << "warning: without -F -R -U -Z, ignoring PATHs" << endl;
 
   fdcache.limit(fdcache_fds, fdcache_mbs);
 
@@ -2894,7 +2921,7 @@ main (int argc, char *argv[])
       obatched ob(clog);
       auto& o = ob << "scanning archive types ";
       for (auto&& arch : scan_archives)
-	o << arch.first << " ";
+	o << arch.first << "(" << arch.second << ") ";
       o << endl;
     }
   const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 651ea33d4106..36094d002f75 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2020-02-05  Frank Ch. Eigler  <fche@redhat.com>
+
+	* debuginfod.8: Document new -Z flag and tweak other bits.
+
 2020-01-10  Mark Wielaard  <mark@klomp.org>
 
 	* debuginfod_find_debuginfo.3 (DEBUGINFOD_PROGRESS): Mention progress
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8
index 166c7c4590ed..d6561edf7159 100644
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -61,20 +61,22 @@ or
 ^C
 .ESAMPLE
 
-If the \fB\-R\fP and/or \fB-U\fP option is given, each file is scanned
-as an archive file that may contain ELF/DWARF/source files.  If \-R is
-given, the will scan RPMs; and/or if \-U is given, they will scan DEB
-/ DDEB files.  (The terms RPM and DEB and DDEB are used synonymously
-as "archives" in diagnostic messages.)  Because of complications such
-as DWZ-compressed debuginfo, may require \fItwo\fP traversal passes to
-identify all source code.  Source files for RPMs are only served from
-other RPMs, so the caution for \-F does not apply.  Note that due to
-Debian/Ubuntu packaging policies & mechanisms, debuginfod cannot
-resolve source files for DEB/DDEB at all.
-
-If no PATH is listed, or neither \fB\-F\fP nor \fB\-R\fP nor \fB\-U\fP
-option is given, then \fBdebuginfod\fP will simply serve content that
-it accumulated into its index in all previous runs.
+If any of the \fB\-R\fP, \fB-U\fP, or \fB-Z\fP options is given, each
+file is scanned as an archive file that may contain ELF/DWARF/source
+files.  Archive files are recognized by extension.  If \-R is given,
+".rpm" files are scanned; if \-D is given, ".deb" and ".ddeb" files
+are scanned; if \-Z is given, the listed extensions are scanned.
+Because of complications such as DWZ-compressed debuginfo, may require
+\fItwo\fP traversal passes to identify all source code.  Source files
+for RPMs are only served from other RPMs, so the caution for \-F does
+not apply.  Note that due to Debian/Ubuntu packaging policies &
+mechanisms, debuginfod cannot resolve source files for DEB/DDEB at
+all.
+
+If no PATH is listed, or none of the scanning options is given, then
+\fBdebuginfod\fP will simply serve content that it accumulated into
+its index in all previous runs, and federate to any upstream
+debuginfod servers.
 
 
 .SH OPTIONS
@@ -91,6 +93,16 @@ Activate RPM patterns in archive scanning.  The default is off.
 .B "\-U"
 Activate DEB/DDEB patterns in archive scanning.  The default is off.
 
+.TP
+.B "\-Z EXT=CMD"
+Activate an additional pattern in archive scanning.  Files with name
+extension EXT will be processed with CMD.  CMD is invoked with the
+file name added to its argument list, and is should produce the
+archive on its standard output.  debuginfod uses libarchive to consume
+the result, so it can accept a wide range of archive formats and
+compression.  (Include the dot in EXT.)  The default is no additional
+patterns.  This option may be repeated.
+
 .TP
 .B "\-d FILE" "\-\-database=FILE"
 Set the path of the sqlite database used to store the index.  This
@@ -123,7 +135,8 @@ against the full path of each file, based on its \fBrealpath(3)\fP
 canonicalization.  By default, all files are included and none are
 excluded.  A file that matches both include and exclude REGEX is
 excluded.  (The \fIcontents\fP of archive files are not subject to
-inclusion or exclusion filtering: they are all processed.)
+inclusion or exclusion filtering: they are all processed.)  Only the
+last of each type of regular expression given is used.
 
 .TP
 .B "\-t SECONDS"  "\-\-rescan\-time=SECONDS"

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-05 20:09 patch rfc: debuginfod -Z (generalized archive) support Frank Ch. Eigler
@ 2020-02-07  9:24 ` Mark Wielaard
  2020-02-07 11:44   ` Frank Ch. Eigler
  0 siblings, 1 reply; 9+ messages in thread
From: Mark Wielaard @ 2020-02-07  9:24 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: elfutils-devel

Hi Frank,

On Wed, Feb 05, 2020 at 03:09:18PM -0500, Frank Ch. Eigler wrote:
> A little extension lets us process arch-linux archives.  Awaiting
> for some small test .pkg's from the arch folks for the elfutils
> testsuite.  However, hand-testing on severa larger files works!
> 
> commit b51ae89befeb81c8b51b15b7168c6e616255b486 (fche/pacman-Z)
> Author: Frank Ch. Eigler <fche@redhat.com>
> Date:   Wed Feb 5 15:04:18 2020 -0500
> 
>     debuginfod: generalized archive support
>     
>     Add a '-Z EXT=CMD' option to debuginfod, which lets it scan any given
>     extension and run CMD on it to unwrap distro archives.  For example,
>     for arch-linux pacman files, -Z '.tar.zst=zstdcat' lets debuginfod
>     grok debug and source content in split-debuginfo files.
>
> diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
> index 8c97fdcf7085..d812e6d71ff0 100644
> --- a/debuginfod/ChangeLog
> +++ b/debuginfod/ChangeLog
> @@ -1,3 +1,9 @@
> +2020-02-05  Frank Ch. Eigler  <fche@redhat.com>
> +
> +	* debuginfod.cxx (argp options): Add -Z option.
> +	(canonicalized_archive_entry_pathname): New function for
> +	distro-agnostic file name matching/storage.

I really like how simple adding this feature is. Shows the design for
adding new formats was done with some foresight.

But the simplicity also confused me at first. And I think it might
also confuse users and that it might be less efficient than it could
be. What this "hides" is that archiving is done in two steps. First
converting the package into an archive that libarchive can handle,
then going through and indexing the files in the archive.

For the first step for rpms we use rpm2cpio, for debs dpkg-deb, but as
your example shows for some other packaging formats we don't actually
need any conversion step. And so the man page example uses cat as if
it is an identity operation on the file.

I think it would be better if you could simply leave off the
conversion CMD and just provide an EXT (without =). This would then
just pass the file with the given extension to libarchive directly
(then you also don't need the zstdcat conversion). And letting
libarchive handle it directly might be more efficient.

Cheers,

Mark

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07  9:24 ` Mark Wielaard
@ 2020-02-07 11:44   ` Frank Ch. Eigler
  2020-02-07 13:30     ` Mark Wielaard
  0 siblings, 1 reply; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-07 11:44 UTC (permalink / raw)
  To: Mark Wielaard; +Cc: elfutils-devel

Hi - 

> I really like how simple adding this feature is. Shows the design for
> adding new formats was done with some foresight.

Thanks!

> [...] I think it would be better if you could simply leave off the
> conversion CMD and just provide an EXT (without =). This would then
> just pass the file with the given extension to libarchive directly
> (then you also don't need the zstdcat conversion). And letting
> libarchive handle it directly might be more efficient.

How about making =CMD optional.  If it's missing, debuginfod would
assume "cat".  (I'd prefer not to have to change the code again for
some other distro that has a thin wrapper around a libarchive-readable
format, where something like rpm2cpio or dog would be needed.)


- FChE

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 11:44   ` Frank Ch. Eigler
@ 2020-02-07 13:30     ` Mark Wielaard
  2020-02-07 13:31       ` Frank Ch. Eigler
  2020-02-07 13:57       ` Frank Ch. Eigler
  0 siblings, 2 replies; 9+ messages in thread
From: Mark Wielaard @ 2020-02-07 13:30 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: elfutils-devel

Hi Frank,

On Fri, Feb 07, 2020 at 06:44:02AM -0500, Frank Ch. Eigler wrote:
> > [...] I think it would be better if you could simply leave off the
> > conversion CMD and just provide an EXT (without =). This would then
> > just pass the file with the given extension to libarchive directly
> > (then you also don't need the zstdcat conversion). And letting
> > libarchive handle it directly might be more efficient.
> 
> How about making =CMD optional.  If it's missing, debuginfod would
> assume "cat".  (I'd prefer not to have to change the code again for
> some other distro that has a thin wrapper around a libarchive-readable
> format, where something like rpm2cpio or dog would be needed.)

Yes, lets make =CMD optional. But why assume "cat"? Can't you then
simply use fopen instaed of popen in the code to get the FILE* to pass
to archive_read_open_FILE? Or even just open and archive_read_open_fd?

Cheers,

Mark

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 13:30     ` Mark Wielaard
@ 2020-02-07 13:31       ` Frank Ch. Eigler
  2020-02-07 13:57       ` Frank Ch. Eigler
  1 sibling, 0 replies; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-07 13:31 UTC (permalink / raw)
  To: Mark Wielaard; +Cc: elfutils-devel

Hi -

> Yes, lets make =CMD optional. But why assume "cat"? Can't you then
> simply use fopen instaed of popen in the code to get the FILE* to pass
> to archive_read_open_FILE? Or even just open and archive_read_open_fd?

Sounds like a plan.

- FChE

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 13:30     ` Mark Wielaard
  2020-02-07 13:31       ` Frank Ch. Eigler
@ 2020-02-07 13:57       ` Frank Ch. Eigler
  2020-02-07 23:09         ` Mark Wielaard
  1 sibling, 1 reply; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-07 13:57 UTC (permalink / raw)
  To: Mark Wielaard; +Cc: elfutils-devel

Hi -

> Yes, lets make =CMD optional. But why assume "cat"? Can't you then
> simply use fopen instaed of popen in the code to get the FILE* to pass
> to archive_read_open_FILE? Or even just open and archive_read_open_fd?

One reason: the defer_dtor<>-based auto-closing of these objects is
different.  (popens must be closed with pclose not something else.)  A
robust auto-closer is needed because of all the exceptions we may
trigger.

- FChE

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 13:57       ` Frank Ch. Eigler
@ 2020-02-07 23:09         ` Mark Wielaard
  2020-02-07 23:15           ` Frank Ch. Eigler
  0 siblings, 1 reply; 9+ messages in thread
From: Mark Wielaard @ 2020-02-07 23:09 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: elfutils-devel

Hi Frank,

On Fri, 2020-02-07 at 08:57 -0500, Frank Ch. Eigler wrote:
> > Yes, lets make =CMD optional. But why assume "cat"? Can't you then
> > simply use fopen instaed of popen in the code to get the FILE* to pass
> > to archive_read_open_FILE? Or even just open and archive_read_open_fd?
> 
> One reason: the defer_dtor<>-based auto-closing of these objects is
> different.  (popens must be closed with pclose not something else.)  A
> robust auto-closer is needed because of all the exceptions we may
> trigger.

My C++ is pretty terrible, but couldn't you handle that with something
like:

diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 6d729023..907b8fc5 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -1160,11 +1160,24 @@ handle_buildid_r_match (int64_t b_mtime,
         archive_extension = arch.first;
         archive_decoder = arch.second;
       }
-  string popen_cmd = archive_decoder + " " + shell_escape(b_source0);
-  FILE* fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
-  if (fp == NULL)
-    throw libc_exception (errno, string("popen ") + popen_cmd);
-  defer_dtor<FILE*,int> fp_closer (fp, pclose);
+  FILE* fp;
+  defer_dtor<FILE*,int>::dtor_fn dfn;
+  if (archive_decoder != "cat")
+    {
+      string popen_cmd = archive_decoder + " " + shell_escape(b_source0);
+      fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
+      dfn = pclose;
+      if (fp == NULL)
+        throw libc_exception (errno, string("popen ") + popen_cmd);
+    }
+  else
+    {
+      fp = fopen (b_source0.c_str(), "r");
+      dfn = fclose;
+      if (fp == NULL)
+        throw libc_exception (errno, string("fopen ") + b_source0);
+    }
+  defer_dtor<FILE*,int> fp_closer (fp, dfn);
 
   struct archive *a;
   a = archive_read_new();

Cheers,

Mark

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 23:09         ` Mark Wielaard
@ 2020-02-07 23:15           ` Frank Ch. Eigler
  2020-02-10 16:15             ` Frank Ch. Eigler
  0 siblings, 1 reply; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-07 23:15 UTC (permalink / raw)
  To: Mark Wielaard; +Cc: elfutils-devel

Hi -

> My C++ is pretty terrible, but couldn't you handle that with something
> like:

Ah, yes, good eye, pclose and fclose are interchangeable function
pointers.  (close(2) along the lines of your other suggestion wouldn't
be.)


- FChE

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: patch rfc: debuginfod -Z (generalized archive) support
  2020-02-07 23:15           ` Frank Ch. Eigler
@ 2020-02-10 16:15             ` Frank Ch. Eigler
  0 siblings, 0 replies; 9+ messages in thread
From: Frank Ch. Eigler @ 2020-02-10 16:15 UTC (permalink / raw)
  To: elfutils-devel; +Cc: Mark Wielaard

Hi -

> > My C++ is pretty terrible, but couldn't you handle that with something
> > like:
> 
> Ah, yes, good eye, pclose and fclose are interchangeable function
> pointers.  (close(2) along the lines of your other suggestion wouldn't
> be.)

pushed to master, with another instance of this improvement, thanks!

- FChE

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-02-10 16:15 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-05 20:09 patch rfc: debuginfod -Z (generalized archive) support Frank Ch. Eigler
2020-02-07  9:24 ` Mark Wielaard
2020-02-07 11:44   ` Frank Ch. Eigler
2020-02-07 13:30     ` Mark Wielaard
2020-02-07 13:31       ` Frank Ch. Eigler
2020-02-07 13:57       ` Frank Ch. Eigler
2020-02-07 23:09         ` Mark Wielaard
2020-02-07 23:15           ` Frank Ch. Eigler
2020-02-10 16:15             ` Frank Ch. Eigler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).