public inbox for systemtap@sourceware.org
 help / color / mirror / Atom feed
* Perfmon systemtap runtime support
@ 2006-07-13 22:21 William Cohen
  0 siblings, 0 replies; 3+ messages in thread
From: William Cohen @ 2006-07-13 22:21 UTC (permalink / raw)
  To: systemtap

[-- Attachment #1: Type: text/plain, Size: 1156 bytes --]

Hi

I have been working on getting some performance monitoring support into 
systemtap. The perfmon1.diff patch is a very simple addition to the 
runtime. It just has functions to setup the perfmon monitoring hardware, 
read a counter, and shutdown the performance monitoring hardware. It 
uses the perfmon2 kernel ABI to configure the hardware.

I have completed changes to the translator to use the runtime functions. 
I took Marin's suggestion of using guru mode to allow access to the 
various C functions and wrote some examples that used the runtime functions.

The cost is relatively high for accessing the counters. Below is the 
output from p2x.stp, counting the number of cycles between consecutive 
calls to read the cycle count:

[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
interval = 15491
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
interval = 16317
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
interval = 15431
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
interval = 15392

I would appreciate any comments or feedback on this code.

-Will

[-- Attachment #2: perfmon1.diff --]
[-- Type: text/x-patch, Size: 5780 bytes --]

? runtime/bench2/bench.stp
? runtime/bench2/itest
? runtime/bench2/stap.out
? runtime/probes/perf
? runtime/probes/os_timer/.built-in.o.cmd
? runtime/probes/os_timer/.os_timer.o.d
? runtime/probes/os_timer/.tmp_versions
? runtime/probes/os_timer/Makefile
? runtime/probes/os_timer/compile.errors
? runtime/probes/scf/.built-in.o.cmd
? runtime/probes/scf/.scf.o.d
? runtime/probes/scf/.tmp_versions
? runtime/probes/scf/Makefile
? runtime/probes/scf/compile.errors
? runtime/probes/test4/.built-in.o.cmd
? runtime/probes/test4/.test4.o.d
? runtime/probes/test4/.tmp_versions
? runtime/probes/test4/Makefile
? runtime/probes/test4/compile.errors
Index: runtime/perf.c
===================================================================
RCS file: runtime/perf.c
diff -N runtime/perf.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ runtime/perf.c	13 Jul 2006 22:13:19 -0000
@@ -0,0 +1,132 @@
+/* -*- linux-c -*- 
+ * Perf Functions
+ * Copyright (C) 2006 Red Hat Inc.
+ *
+ * This file is part of systemtap, and is free software.  You can
+ * redistribute it and/or modify it under the terms of the GNU General
+ * Public License (GPL); either version 2, or (at your option) any
+ * later version.
+ */
+
+#ifndef _PERF_C_
+#define _PERF_C_
+
+#include <linux/perfmon.h>
+
+#include "perf.h"
+
+/** @file perf.c
+ * @brief Implements performance monitoring hardware support
+ */
+
+/* TODO fix so this works on SMP machines
+ * Need to do context load, register setup, and start on each processor
+ *
+ * Similarly need to stop and unload on each processor
+ */
+
+/* TODO make this work with sampling. There needs to be a helper thread
+ * handling the sampling. */
+
+
+/* Program the control registers first, then the data registers. */
+static int _stp_pfm_register_setup(void *desc,
+		       struct pfarg_pmc pmc[], int pmc_count,
+		       struct pfarg_pmd pmd[], int pmd_count)
+{
+	int rc = pfmk_write_pmcs(desc, pmc, pmc_count);
+
+	/* a failed PMC write skips the PMD write and is reported as-is */
+	if (rc != 0)
+		return rc;
+	return pfmk_write_pmds(desc, pmd, pmd_count);
+}
+
+static struct completion c;		/* passed to pfmk_create_context() */
+static struct pfarg_load load_args;	/* passed to pfmk_load_context() */
+static struct pfarg_start start_args;	/* passed to pfmk_start() */
+
+/** Sets up and starts the performance monitoring hardware.
+ * Creates a context, programs the registers, loads the context and
+ * starts it. The locations desc and context point to are modified as
+ * side-effects; *desc is reset to NULL when any step fails.
+ * @param desc pointer to void *, handle to describe perfmon config
+ * @param context pointer to context information
+ * @param pmc pointer to array describing control register setup
+ * @param pmc_count number of entries in pmc
+ * @param pmd pointer to array describing data register setup
+ * @param pmd_count number of entries in pmd
+ * @returns an int, 0 if no errors encountered during setup
+ */
+int _stp_perfmon_setup(void **desc,
+		       struct pfarg_ctx *context,
+		       struct pfarg_pmc pmc[], int pmc_count,
+		       struct pfarg_pmd pmd[], int pmd_count)
+{
+	int rc;
+
+	/* step 1: create a context */
+	rc = pfmk_create_context(context, NULL, 0, &c, desc, NULL);
+	if (rc != 0)
+		goto fail_create;
+
+	/* step 2: program the control and data registers */
+	rc = _stp_pfm_register_setup(*desc, pmc, pmc_count, pmd, pmd_count);
+	if (rc != 0)
+		goto fail_close;
+
+	/* step 3: load the context, then start measuring */
+	rc = pfmk_load_context(*desc, &load_args);
+	if (rc != 0) {
+		printk("pfmk_load_context error\n");
+		goto fail_close;
+	}
+	rc = pfmk_start(*desc, &start_args);
+	if (rc == 0)
+		return 0;
+	printk("pfmk_start error\n");
+	pfmk_unload_context(*desc);	/* undo the load before closing */
+fail_close:
+	pfmk_close(*desc);
+fail_create:
+	*desc = NULL;
+	return rc;
+}
+
+/** Shuts down the performance monitoring hardware.
+ * A NULL desc is ignored; every teardown step is always attempted.
+ * @param desc unique pointer to describe configuration
+ * @returns an int, 0 if no errors, else the first error encountered
+ */
+int _stp_perfmon_shutdown(void *desc)
+{
+	int err, rc;
+	if (desc == NULL) return 0; /* setup failed: nothing to tear down */
+	err = pfmk_stop(desc);
+	rc = pfmk_unload_context(desc);
+	if (!err) err = rc;
+	rc = pfmk_close(desc); /* always close so the context is not leaked */
+	return err ? err : rc;
+}
+
+/** Reads the performance counter
+ * @param desc unique pointer to describe configuration
+ * @param counter number of the data register (PMD) to read
+ * @returns an int64, raw value of counter; 0 if desc is NULL or read fails
+ */
+int64_t _stp_perfmon_read(void *desc, int counter)
+{
+	/* Unnamed members are zeroed, so reg_value is never indeterminate. */
+	struct pfarg_pmd storage = { .reg_set = 0, .reg_num = counter };
+
+	if (desc == NULL)
+		return 0;
+	if (pfmk_read_pmds(desc, &storage, 1)) {
+		printk("pfm_read_pmds error\n");
+		return 0;
+	}
+	return storage.reg_value;
+}
+
+#endif /* _PERF_C_ */
+
Index: runtime/perf.h
===================================================================
RCS file: runtime/perf.h
diff -N runtime/perf.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ runtime/perf.h	13 Jul 2006 22:13:19 -0000
@@ -0,0 +1,27 @@
+/* -*- linux-c -*- 
+ * Perf Header File
+ * Copyright (C) 2006 Red Hat Inc.
+ *
+ * This file is part of systemtap, and is free software.  You can
+ * redistribute it and/or modify it under the terms of the GNU General
+ * Public License (GPL); either version 2, or (at your option) any
+ * later version.
+ */
+
+#ifndef _PERF_H_
+#define _PERF_H_
+
+/** @file perf.h
+ * @brief Header file for performance monitoring hardware support
+ */
+
+int _stp_perfmon_setup(void **desc,
+		       struct pfarg_ctx *context,
+		       struct pfarg_pmc pmc[], int pmc_count,
+		       struct pfarg_pmd pmd[], int pmd_count);
+
+int _stp_perfmon_shutdown(void *desc);
+
+int64_t _stp_perfmon_read(void *desc, int counter);
+
+#endif /* _PERF_H_ */
Index: runtime/runtime.h
===================================================================
RCS file: /cvs/systemtap/src/runtime/runtime.h,v
retrieving revision 1.28
diff -u -r1.28 runtime.h
--- runtime/runtime.h	28 Nov 2005 22:08:39 -0000	1.28
+++ runtime/runtime.h	13 Jul 2006 22:13:19 -0000
@@ -64,6 +64,7 @@
 #include "copy.c"
 #include "sym.h"
 #include "alloc.c"
+#include "perf.c"
 
 
 /************* Module Stuff ********************/

[-- Attachment #3: p1x.stp --]
[-- Type: text/plain, Size: 1350 bytes --]

/* stap -g p1x.stp
   Make use of guru mode to check that the runtime functions are in place
   This code only works on AMD64 processors.
*/

%{
/* Context request handed to _stp_perfmon_setup() and the handle it fills in */
static struct pfarg_ctx context;
static void *desc;

/* set things up for AMD64 */
/* monitor events in user mode (CPL > 0) */
#define USR_BIT (1<<16)
/* monitor events in system mode (CPL == 0) */
#define OS_BIT (1<<17)
/* edge detect */
#define E_BIT (1<<18)
/* pin control */
#define PC_BIT (1<<19)
/* generate APIC interrupt on overflow */
#define INT_BIT (1<<20)
/* enable counter */
#define EN_BIT (1<<22)
/* invert counter mask */
#define INV_BIT (1<<23)
/* one data register (PMD): register 0, starting at 0 */
#define NUM_PMD 1
static struct pfarg_pmd pmd[] = {
	{.reg_num=0, .reg_value=0}
};
static int num_pfm_pmd = NUM_PMD;
/* one control register (PMC): register 0, event select 0x76 counted in
   user and system mode, counter enabled, interrupt bit set */
#define NUM_PMC 1
static struct pfarg_pmc pmc[] = {
	{.reg_num=0, .reg_value=(0x76|USR_BIT|OS_BIT|EN_BIT|INT_BIT)}
};
static int num_pfm_pmc = NUM_PMC;
%}

function cpu_pfm_init:long ()
%{
	/* Sets up a system-wide perfmon context from the pmc/pmd tables and
	 * starts counting. Returns 0 on success, else the setup error. */
	int err = 0;

	/* set up context information */
	/* only does system-wide contexts */
	context.ctx_flags |= PFM_FL_SYSTEM_WIDE;

	err = _stp_perfmon_setup(&desc, &context,
				 pmc, num_pfm_pmc,
				 pmd, num_pfm_pmd);

	printk("err = %d, desc = 0x%p\n", err, desc);

	if (err) {
		printk("unable to set up counters\n");
	}

	/* the function is declared :long, so hand the result to the script */
	THIS->__retvalue = err;
%}

function cpu_pfm_getreg:long (reg:long)
%{
	/* Read data register `reg` through the handle set up by cpu_pfm_init */
	int64_t value = _stp_perfmon_read(desc, THIS->reg);

	THIS->__retvalue = value;
%}

function cpu_pfm_cleanup:long ()
%{
	/* Stops the counters and releases the perfmon context.
	 * Returns 0 on success (or when there is nothing to release),
	 * else the shutdown error. */
	int err = 0;

	/* desc is NULL when cpu_pfm_init failed or was never run */
	if (desc != NULL) {
		err = _stp_perfmon_shutdown(desc);
		if (err)
			printk("_stp_perfmon_shutdown error\n");
		/* drop the handle so later reads see "no context" */
		desc = NULL;
	}
	THIS->__retvalue = err;
%}

/* Start counting when the script starts. */
probe begin
{
	cpu_pfm_init();
}

/* At exit, report data register 0 and release the perfmon context. */
probe end
{
	pmd_value = cpu_pfm_getreg(0);
	printf("pmd = %d\n", pmd_value);
	cpu_pfm_cleanup();
}

[-- Attachment #4: p2x.stp --]
[-- Type: text/plain, Size: 1419 bytes --]

/* stap -g p2x.stp
   Quick check to see how expensive the reading of the perfmon hw is.
   This code only works on AMD64 processors.
*/

%{
/* Context request handed to _stp_perfmon_setup() and the handle it fills in */
static struct pfarg_ctx context;
static void *desc;

/* set things up for AMD64 */
/* monitor events in user mode (CPL > 0) */
#define USR_BIT (1<<16)
/* monitor events in system mode (CPL == 0) */
#define OS_BIT (1<<17)
/* edge detect */
#define E_BIT (1<<18)
/* pin control */
#define PC_BIT (1<<19)
/* generate APIC interrupt on overflow */
#define INT_BIT (1<<20)
/* enable counter */
#define EN_BIT (1<<22)
/* invert counter mask */
#define INV_BIT (1<<23)
/* one data register (PMD): register 0, starting at 0 */
#define NUM_PMD 1
static struct pfarg_pmd pmd[] = {
	{.reg_num=0, .reg_value=0}
};
static int num_pfm_pmd = NUM_PMD;
/* one control register (PMC): register 0, event select 0x76 counted in
   user and system mode, counter enabled, interrupt bit set */
#define NUM_PMC 1
static struct pfarg_pmc pmc[] = {
	{.reg_num=0, .reg_value=(0x76|USR_BIT|OS_BIT|EN_BIT|INT_BIT)}
};
static int num_pfm_pmc = NUM_PMC;
%}

function cpu_pfm_init:long ()
%{
	/* Sets up a system-wide perfmon context from the pmc/pmd tables and
	 * starts counting. Returns 0 on success, else the setup error. */
	int err = 0;

	/* set up context information */
	/* only does system-wide contexts */
	context.ctx_flags |= PFM_FL_SYSTEM_WIDE;

	err = _stp_perfmon_setup(&desc, &context,
				 pmc, num_pfm_pmc,
				 pmd, num_pfm_pmd);

	printk("err = %d, desc = 0x%p\n", err, desc);

	if (err) {
		printk("unable to set up counters\n");
	}

	/* the function is declared :long, so hand the result to the script */
	THIS->__retvalue = err;
%}

function cpu_pfm_getreg:long (reg:long)
%{
	/* Read data register `reg` through the handle set up by cpu_pfm_init */
	int64_t value = _stp_perfmon_read(desc, THIS->reg);

	THIS->__retvalue = value;
%}

function cpu_pfm_cleanup:long ()
%{
	/* Stops the counters and releases the perfmon context.
	 * Returns 0 on success (or when there is nothing to release),
	 * else the shutdown error. */
	int err = 0;

	/* desc is NULL when cpu_pfm_init failed or was never run */
	if (desc != NULL) {
		err = _stp_perfmon_shutdown(desc);
		if (err)
			printk("_stp_perfmon_shutdown error\n");
		/* drop the handle so later reads see "no context" */
		desc = NULL;
	}
	THIS->__retvalue = err;
%}

/* Counter values returned by two consecutive reads of data register 0 */
global first
global second

/* Set up the counter, read it twice back to back from script code, print
   the difference between the two values, then shut the counter down.
   The two reads must stay adjacent: the difference is the measurement. */
probe begin
{
	cpu_pfm_init();
	first =  cpu_pfm_getreg(0);
	second =  cpu_pfm_getreg(0);
	printf("interval = %d\n", second-first);
	cpu_pfm_cleanup();
}

[-- Attachment #5: p3x.stp --]
[-- Type: text/plain, Size: 1505 bytes --]

/* stap -g p3x.stp
   Quick check to see how expensive the reading of the perfmon hw is
   from the C code. Check to see how much overhead is in the stap-generated
   code in p2x.stp.
   This code only works on AMD64 processors.
*/

%{
/* Context request handed to _stp_perfmon_setup() and the handle it fills in */
static struct pfarg_ctx context;
static void *desc;

/* set things up for AMD64 */
/* monitor events in user mode (CPL > 0) */
#define USR_BIT (1<<16)
/* monitor events in system mode (CPL == 0) */
#define OS_BIT (1<<17)
/* edge detect */
#define E_BIT (1<<18)
/* pin control */
#define PC_BIT (1<<19)
/* generate APIC interrupt on overflow */
#define INT_BIT (1<<20)
/* enable counter */
#define EN_BIT (1<<22)
/* invert counter mask */
#define INV_BIT (1<<23)
/* one data register (PMD): register 0, starting at 0 */
#define NUM_PMD 1
static struct pfarg_pmd pmd[] = {
	{.reg_num=0, .reg_value=0}
};
static int num_pfm_pmd = NUM_PMD;
/* one control register (PMC): register 0, event select 0x76 counted in
   user and system mode, counter enabled, interrupt bit set */
#define NUM_PMC 1
static struct pfarg_pmc pmc[] = {
	{.reg_num=0, .reg_value=(0x76|USR_BIT|OS_BIT|EN_BIT|INT_BIT)}
};
static int num_pfm_pmc = NUM_PMC;
%}

function cpu_pfm_init:long ()
%{
	/* Sets up a system-wide perfmon context from the pmc/pmd tables and
	 * starts counting. Returns 0 on success, else the setup error. */
	int err = 0;

	/* set up context information */
	/* only does system-wide contexts */
	context.ctx_flags |= PFM_FL_SYSTEM_WIDE;

	err = _stp_perfmon_setup(&desc, &context,
				 pmc, num_pfm_pmc,
				 pmd, num_pfm_pmd);

	printk("err = %d, desc = 0x%p\n", err, desc);

	if (err) {
		printk("unable to set up counters\n");
	}

	/* the function is declared :long, so hand the result to the script */
	THIS->__retvalue = err;
%}

function cpu_pfm_getdiff:long ()
%{
	/* Read data register 0 twice back to back, entirely in C, and return
	 * the difference between the two values. The reads must stay
	 * adjacent: the difference is the measurement. */
	int64_t first, second;
	first =  _stp_perfmon_read(desc, 0);
	second =  _stp_perfmon_read(desc, 0);
	THIS->__retvalue = (second-first);
%}

function cpu_pfm_cleanup:long ()
%{
	/* Stops the counters and releases the perfmon context.
	 * Returns 0 on success (or when there is nothing to release),
	 * else the shutdown error. */
	int err = 0;

	/* desc is NULL when cpu_pfm_init failed or was never run */
	if (desc != NULL) {
		err = _stp_perfmon_shutdown(desc);
		if (err)
			printk("_stp_perfmon_shutdown error\n");
		/* drop the handle so later reads see "no context" */
		desc = NULL;
	}
	THIS->__retvalue = err;
%}

/* Set up the counter, print the read-to-read difference measured in C,
   then shut the counter down. */
probe begin
{
	cpu_pfm_init();
	interval = cpu_pfm_getdiff();
	printf("interval = %d\n", interval);
	cpu_pfm_cleanup();
}

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Perfmon systemtap runtime support
  2006-07-14  4:59 Chuck Ebbert
@ 2006-07-14 15:50 ` William Cohen
  0 siblings, 0 replies; 3+ messages in thread
From: William Cohen @ 2006-07-14 15:50 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: systemtap

Chuck Ebbert wrote:
> In-Reply-To: <44B6C74E.7040008@redhat.com>
> 
> On Thu, 13 Jul 2006 18:21:02 -0400, William Cohen wrote:
> 
>>I have been working on getting some performance monitoring support into 
>>systemtap. The perfmon1.diff patch is a very simple addition to the 
>>runtime. It just has functions to setup the perfmon monitoring hardware, 
>>read a counter, and shutdown the performance monitoring hardware. It 
>>uses the perfmon2 kernel ABI to configure the hardware.
>>
>>I have completed changes to the translator to use the runtime functions. 
>>I took Marin's suggestion of using guru mode to allow access to the 
>>various C functions and wrote some examples that used the runtime functions.
>>
>>The cost is relatively high for accessing the counters. Below is the 
>>output from p2x.stp, counting the number of cycles between consecutive 
>>calls to read the cycle count:
>>
>>[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
>>interval = 15491
> 
> 
> I modified your sample pure C module from a while ago and I get 500-700
> cycles overhead:

I tried your example and I got similar numbers:

Jul 14 11:47:40 dhcp59-187 kernel: val0 = 6595400, val1 = 6595847, 
interval = 447
Jul 14 11:47:41 dhcp59-187 kernel: val0 = 6842567, val1 = 6843068, 
interval = 501

I don't know why I got such high cycle counts yesterday. Running the 
same p2x.stp today I got lower cycle counts:

[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap  -g p2x.stp
interval = 1051
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap  -g p2x.stp
interval = 1009
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap  -g p2x.stp
interval = 650
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap  -g p2x.stp
interval = 1063
[wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap  -g p2x.stp
interval = 983

-Will

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Perfmon systemtap runtime support
@ 2006-07-14  4:59 Chuck Ebbert
  2006-07-14 15:50 ` William Cohen
  0 siblings, 1 reply; 3+ messages in thread
From: Chuck Ebbert @ 2006-07-14  4:59 UTC (permalink / raw)
  To: William Cohen; +Cc: systemtap

In-Reply-To: <44B6C74E.7040008@redhat.com>

On Thu, 13 Jul 2006 18:21:02 -0400, William Cohen wrote:
> 
> I have been working on getting some performance monitoring support into 
> systemtap. The perfmon1.diff patch is a very simple addition to the 
> runtime. It just has functions to setup the perfmon monitoring hardware, 
> read a counter, and shutdown the performance monitoring hardware. It 
> uses the perfmon2 kernel ABI to configure the hardware.
> 
> I have completed changes to the translator to use the runtime functions. 
> I took Marin's suggestion of using guru mode to allow access to the 
> various C functions and wrote some examples that used the runtime functions.
> 
> The cost is relatively high for accessing the counters. Below is the 
> output from p2x.stp, counting the number of cycles between consecutive 
> calls to read the cycle count:
> 
> [wcohen@dhcp59-187 systemtap_perfmon]$ ./install/bin/stap -g  p2x.stp
> interval = 15491

I modified your sample pure C module from a while ago and I get 500-700
cycles overhead:

[root@tu kpfm_test3]# insmod ./kpfm_test3.ko ; rmmod kpfm_test3
val0 = 998005, val1 = 998533, interval = 528

---------------- Makefile ----------------
buildtest:
	make -C /lib/modules/2.6.17.1-32-pfmon/build M=`pwd` modules

obj-m += kpfm_test3.o

clean:
	/bin/rm -rf *.o *.ko *~ *.mod.c .*.cmd .tmp_versions Modules.symvers

---------------- README ----------------
This is a simple example to show how Perfmon2 kabi works.  It counts
the cpu_clock_unhalted events. This is currently only set up for the
AMD64. It won't work with other processors without modifications.

To run:

# make
# modprobe perfmon_{arch}
# /sbin/insmod ./kpfm_test3.ko
# /sbin/rmmod kpfm_test3
(should print out information in /var/log/messages)
# tail /var/log/messages

Modified to count how many clocks it takes to read the PMD registers.

---------------- kpfm_test3.c ----------------
/*
 * kpfm_test3.c
 *
 * Copyright (c) 2006 Red Hat
 * 		Contributions by William Cohen <wcohen@redhat.com>
 *		Modified by Chuck Ebbert <76306.1226@compuserve.com>
 *
 * A simple program to test overhead of reading perfmon counters.
 *
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/config.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/sysctl.h>

#include <linux/perfmon.h>

MODULE_AUTHOR("William Cohen <wcohen@redhat.com>");
MODULE_DESCRIPTION("kpfm_test3 module to exercise perfmon2 KABI");
MODULE_LICENSE("GPL");

static struct pfarg_ctx req;		/* context request; flags set in cpu_pfm_init() */
static struct completion c;		/* handed to pfmk_create_context() */
static void *desc;			/* context handle filled in by pfmk_create_context() */
static struct pfarg_load load_args;	/* argument block for pfmk_load_context() */
static struct pfarg_start start_args;	/* argument block for pfmk_start() */

/* Set things up for AMD64.
 */
/* monitor events in user mode (CPL > 0) */
#define USR_BIT (1<<16)
/* monitor events in system mode (CPL == 0) */
#define OS_BIT (1<<17)
/* edge detect */
#define EDGE_BIT (1<<18)
/* pin control */
#define PC_BIT (1<<19)
/* generate APIC interrupt on overflow */
#define INT_BIT (1<<20)
/* enable counter */
#define EN_BIT (1<<22)
/* invert counter mask */
#define INV_BIT (1<<23)

#define NUM_PMD 1
/* start with reg 1 (0 is used by softlockup on x86_64) */
#define FIRST_PMD 1
static struct pfarg_pmd pd[] = {
	{.reg_num = FIRST_PMD,
	 .reg_value = 0,
	},
};
static int num_pfm_pmd = NUM_PMD;

#define NUM_PMC 1
#define FIRST_PMC 1
/* event 0x76 = number of unhalted CPU clocks */
#define EVT_SEL 0x76
static struct pfarg_pmc pc[] = {
	{.reg_num = FIRST_PMC,
	 .reg_value = EVT_SEL | OS_BIT | EN_BIT | INT_BIT,
	},
};
static int num_pfm_pmc = NUM_PMC;


static int pfm_register_setup(void)
{
	/* for the time being hard coded the events to monitor */
	int err = 0;

	err = pfmk_write_pmcs(desc, pc, num_pfm_pmc);
	if (err)
		goto out;
	
	err = pfmk_write_pmds(desc, pd, num_pfm_pmd);

 out:	
	return err;
}

static int cpu_pfm_init(void)
{
	int err = 0;

	/* set up context information */
	/* only does system-wide contexts */
	req.ctx_flags |= PFM_FL_SYSTEM_WIDE;

	err = pfmk_create_context(&req, NULL, 0, &c, &desc, NULL);
	if (err)
		goto cleanup;

	err = pfm_register_setup();
	if (err)
		goto cleanup2;

	err = pfmk_load_context(desc, &load_args);
	if (err) {
		printk("pfmk_load_context error\n");
		goto cleanup2;
	}

	err = pfmk_start(desc, &start_args);
	if (err) {
		printk("pfmk_start error\n");
		goto cleanup3;
	}

	return err;

 cleanup3:
	pfmk_unload_context(desc);
 cleanup2:
	pfmk_close(desc);
 cleanup:
	return err;
}

/* Module entry point: the whole job is to bring up the counter. */
static int kpfm_test3_init_module(void)
{
	int rc = cpu_pfm_init();

	return rc;
}

static void cpu_pfm_cleanup(void)
{
	long long a;

	/* read pmds twice in a row and see how many
	 * clock cycles elapse between reads
	 */	
	if (pfmk_read_pmds(desc, pd, num_pfm_pmd))
		printk( "pfm_read_pmds error\n");
	a = pd[0].reg_value;
	if (pfmk_read_pmds(desc, pd, num_pfm_pmd))
		printk( "pfm_read_pmds error\n");

	printk ("val0 = %lld, val1 = %lld, interval = %lld\n",
		a, pd[0].reg_value, pd[0].reg_value - a);

	if (pfmk_stop(desc))
		printk("pfmk_stop error\n");
	if (pfmk_unload_context(desc))
		printk ("pfmk_unload_context error\n");
	if (pfmk_close(desc))
		printk ("pfmk_unload_context error\n");;
}

/* Module exit point: print the read overhead and tear the counter down. */
static void kpfm_test3_cleanup_module(void)
{
	cpu_pfm_cleanup();
}

module_init(kpfm_test3_init_module);
module_exit(kpfm_test3_cleanup_module);

-- 
Chuck
 "You can't read a newspaper if you can't read."  --George W. Bush

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-07-14 15:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-13 22:21 Perfmon systemtap runtime support William Cohen
2006-07-14  4:59 Chuck Ebbert
2006-07-14 15:50 ` William Cohen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).