public inbox for libc-alpha@sourceware.org
* [PATCH] Add math benchmark latency test
From: Wilco Dijkstra @ 2017-08-16 11:26 UTC
  To: libc-alpha; +Cc: nd

This patch further improves math function benchmarking by adding a latency
test in addition to throughput.  This enables more accurate comparisons of the
math functions. The latency test works by creating a dependency on the previous
iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids
changing the input.
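
In simplified form (a sketch only, eliding the BENCH_FUNC/BENCH_FUNC_LAT
macro layer in the patch below), the two measurement loops are:

  /* Throughput: calls are independent, so the CPU can overlap them.  */
  for (i = 0; i < n; i++)
    ret = func_res = F (input[i]);

  /* Latency: each call consumes the previous result, so the calls execute
     serially.  zero is a global set to 0, so the input value is unchanged.  */
  for (i = 0; i < n; i++)
    ret = func_res = F (func_res * zero + input[i]);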

The powf test now shows:

   "workload-spec2006.wrf": {
    "throughput": 200,
    "latency": 100
   }

OK for commit?

ChangeLog:
2017-08-16  Wilco Dijkstra  <wdijkstr@arm.com>
  
        * benchtests/bench-skeleton.c (main): Add support for
        latency benchmarking.
        * benchtests/scripts/bench.py: Add support for latency benchmarking.

--
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 3c6dad705594ac0a53edcb4e09686252c13127cf..60753ede1aa3cc05cc0e9eccc74dd12a609a1a67 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -71,8 +71,10 @@ main (int argc, char **argv)
       bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+      timing_t throughput = 0, latency = 0;
       int64_t c = 0;
       uint64_t cur;
+      BENCH_VARS;
       while (1)
 	{
 	  if (is_bench)
@@ -86,7 +88,16 @@ main (int argc, char **argv)
 		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
 	      TIMING_DIFF (cur, start, end);
-	      TIMING_ACCUM (total, cur);
+	      TIMING_ACCUM (throughput, cur);
+
+	      TIMING_NOW (start);
+	      for (k = 0; k < iters; k++)
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC_LAT (v, i);
+	      TIMING_NOW (end);
+	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (latency, cur);
+
 	      d_total_i += iters * NUM_SAMPLES (v);
 	    }
 	  else
@@ -131,12 +142,15 @@ main (int argc, char **argv)
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
-      json_attr_double (&json_ctx, "duration", d_total_s);
-      json_attr_double (&json_ctx, "iterations", d_total_i);
       if (is_bench)
-	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+	{
+	  json_attr_double (&json_ctx, "throughput", throughput / d_total_i);
+	  json_attr_double (&json_ctx, "latency", latency / d_total_i);
+	}
       else
 	{
+	  json_attr_double (&json_ctx, "duration", d_total_s);
+	  json_attr_double (&json_ctx, "iterations", d_total_i);
 	  json_attr_double (&json_ctx, "max", max / d_iters);
 	  json_attr_double (&json_ctx, "min", min / d_iters);
 	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py
index 8c1c9eeb2bc67a16cb8a8e010fd2b8a2ef8ab6df..b7ccb7c8c2bf1822202a2377dfb0675516115cc5 100755
--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py
@@ -45,7 +45,7 @@ DEFINES_TEMPLATE = '''
 # variant is represented by the _VARIANT structure.  The ARGS structure
 # represents a single set of arguments.
 STRUCT_TEMPLATE = '''
-#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s)
 
 struct args
 {
@@ -84,7 +84,9 @@ EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
 #define RESULT_ACCUM(r, v, i, old, new) \\
         ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
-#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );})
+#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);})
+#define BENCH_VARS %(defvar)s
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
 
@@ -122,17 +124,22 @@ def gen_source(func, directives, all_vals):
     # If we have a return value from the function, make sure it is
     # assigned to prevent the compiler from optimizing out the
     # call.
+    getret = ''
+    latarg = ''
+    defvar = ''
+
     if directives['ret']:
         print('static %s volatile ret;' % directives['ret'])
-        getret = 'ret = '
-    else:
-        getret = ''
+        print('static %s zero __attribute__((used)) = 0;' % directives['ret'])
+        getret = 'ret = func_res = '
+        latarg = 'func_res * zero +'
+        defvar = '%s func_res = 0;' % directives['ret']
 
     # Test initialization.
     if directives['init']:
         print('#define BENCH_INIT %s' % directives['init'])
 
-    print(EPILOGUE % {'getret': getret, 'func': func})
+    print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar })
 
 
 def _print_arg_data(func, directives, all_vals):


* Re: [PATCH] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-16 13:07 UTC
  To: Wilco Dijkstra, libc-alpha; +Cc: nd

On Wednesday 16 August 2017 04:55 PM, Wilco Dijkstra wrote:
> This patch further improves math function benchmarking by adding a latency
> test in addition to throughput.  This enables more accurate comparisons of the
> math functions. The latency test works by creating a dependency on the previous
> iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids
> changing the input.
...
> -      json_attr_double (&json_ctx, "duration", d_total_s);
> -      json_attr_double (&json_ctx, "iterations", d_total_i);
>        if (is_bench)
> -	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
> +	{
> +	  json_attr_double (&json_ctx, "throughput", throughput / d_total_i);

I didn't notice this earlier, but shouldn't throughput be
iterations/cycle and not the other way around?  That is, throughput
should be the inverse of latency.


* Re: [PATCH] Add math benchmark latency test
From: Szabolcs Nagy @ 2017-08-16 13:43 UTC
  To: Siddhesh Poyarekar, Wilco Dijkstra, libc-alpha; +Cc: nd

On 16/08/17 14:07, Siddhesh Poyarekar wrote:
> On Wednesday 16 August 2017 04:55 PM, Wilco Dijkstra wrote:
>> This patch further improves math function benchmarking by adding a latency
>> test in addition to throughput.  This enables more accurate comparisons of the
>> math functions. The latency test works by creating a dependency on the previous
>> iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids
>> changing the input.
>>
...
>> -      json_attr_double (&json_ctx, "duration", d_total_s);
>> -      json_attr_double (&json_ctx, "iterations", d_total_i);
>>        if (is_bench)
>> -	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
>> +	{
>> +	  json_attr_double (&json_ctx, "throughput", throughput / d_total_i);
> 
> I didn't notice this earlier, but shouldn't throughput be
> iterations/cycle and not the other way around?  That is, throughput
> should be the inverse of latency.

yes, in principle throughput is the number of calls per unit time,
but the inverse of that is just as useful: it's the interval at
which calls can be issued, which makes it directly comparable
to the latency number, but i guess printing it as calls/sec
would work too.
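
(e.g. a reciprocal throughput of 10 ns/call carries the same
information as 1e8 calls/sec.)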


* Re: [PATCH] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-16 14:23 UTC
  To: Szabolcs Nagy, Wilco Dijkstra, libc-alpha; +Cc: nd

On Wednesday 16 August 2017 07:13 PM, Szabolcs Nagy wrote:
> yes in principle the throughput is number of calls/unit time,
> but the inverse of that is just as useful, it's the rate at
> which calls can be issued and then it can be easily compared
> to the latency number, but i guess printing it as calls/sec
> would work too.

I don't dispute its utility; I'd just like it to match what it is
called, so calls/unit time for throughput and time/call for latency.

BTW, this will only work for functions that have the same first argument
and return types.  This is OK for now, but it might be something to fix
in future.  A comment to that effect in the code would be very useful
because I'm pretty sure I'll forget about it in the next 15 minutes or
so and Wilco will too, maybe later than that :)

Siddhesh


* Re: [PATCH] Add math benchmark latency test
From: Arjan van de Ven @ 2017-08-16 14:23 UTC
  To: Siddhesh Poyarekar, Wilco Dijkstra, libc-alpha; +Cc: nd

On 8/16/2017 6:07 AM, Siddhesh Poyarekar wrote:
> I didn't notice this earlier, but shouldn't throughput be
> iterations/cycle and not the other way around?  That is, throughput
> should be the inverse of latency.


well not really...

I've been working on making expf() faster for x86 (see HJ's email earlier), and
with a massive out of order/pipelined cpu, latency and throughput are very distinct things.
expf() can run at a throughput of somewhere in the 10 to 11 cycles range, while the latency
can be in the 45 to 55 cycles range.
(not trying to do benchmarking here, just wanting to show an order of magnitude)

the latency is then the number of cycles it takes to get a result (on an empty cpu)
through from end to end, e.g.

printf("%e", expf(f1))

while throughput is the cost if you put multiple consecutive calls through the cpu,
like

printf("%e", expf(f1) + expf(f2) + expf(f3) + expf(4))

(using "printf" as a proxy for 'make externally visible' sync point; of course in reality it could be many other things)


the out of order cpu will start execution of the second, third and fourth expf() in parallel to the first, which will
hide the latency (so the result time is not 4x45 + time of 4 adds, but much less, closer to 45 + 3x11 + time of 4 adds)

I picked 4 expf()s here, but theoretically throughput would be measured in the limit as the number of calls grows...



* Re: [PATCH] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-16 14:26 UTC
  To: Arjan van de Ven, Wilco Dijkstra, libc-alpha; +Cc: nd

On Wednesday 16 August 2017 07:53 PM, Arjan van de Ven wrote:
> well not really...
> 
> I've been working on making expf() faster for x86 (see HJ's email
> earlier), and
> with a massive out of order/pipelined cpu, latency and throughput are
> very distinct things.
> expf() can run at a throughput of somewhere in the 10 to 11 cycles
> range, while the latency
> can be in the 45 to 55 cycles range.
> (not trying to do benchmarking here, just wanting to show an order of
> magnitude)

Ah sorry, I should have been clearer - inverse in terms of units, not
value, i.e. throughput is typically expressed in items/unit time while
latency is expressed in time/item.

Siddhesh


* Re: [PATCH] Add math benchmark latency test
From: Arjan van de Ven @ 2017-08-16 14:33 UTC
  To: Siddhesh Poyarekar, Wilco Dijkstra, libc-alpha; +Cc: nd

On 8/16/2017 7:26 AM, Siddhesh Poyarekar wrote:
>
> Ah sorry, I should have been clearer - inverse in terms of units, not
> value, i.e. throughput is typically expressed in items/unit time while
> latency is expressed in time/unit item.

maybe "throughput-time" ?



* Re: [PATCH] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-16 14:36 UTC
  To: Arjan van de Ven, Wilco Dijkstra, libc-alpha; +Cc: nd

On Wednesday 16 August 2017 08:03 PM, Arjan van de Ven wrote:
>> Ah sorry, I should have been clearer - inverse in terms of units, not
>> value, i.e. throughput is typically expressed in items/unit time while
>> latency is expressed in time/unit item.
> 
> maybe "throughput-time" ?

Sure, but throughput proper (i.e. calls per unit time) will give bigger
numbers and happier marketing teams :)

Siddhesh


* Re: [PATCH] Add math benchmark latency test
From: Alexander Monakov @ 2017-08-16 14:39 UTC
  To: Arjan van de Ven; +Cc: Siddhesh Poyarekar, Wilco Dijkstra, libc-alpha, nd

On Wed, 16 Aug 2017, Arjan van de Ven wrote:
> > Ah sorry, I should have been clearer - inverse in terms of units, not
> > value, i.e. throughput is typically expressed in items/unit time while
> > latency is expressed in time/unit item.
> 
> maybe "throughput-time" ?

I suggest using "reciprocal throughput" if you're looking for a short term
for 'independent executions per unit time'.  It's easier to recognize and
already used in practice (e.g. in docs by Agner Fog).

Alexander


* Re: [PATCH] Add math benchmark latency test
From: Wilco Dijkstra @ 2017-08-16 15:22 UTC
  To: Alexander Monakov, Arjan van de Ven; +Cc: Siddhesh Poyarekar, libc-alpha, nd

Alexander Monakov wrote:
> I suggest using "reciprocal throughput" if you're looking for a short term
> for 'independent executions per unit time'.  It's easier to recognize and
> already used in practice (e.g. in docs by Agner Fog).

Right, what about this?

   "workload-spec2006.wrf": {
    "reciprocal throughput (ns)": 20,
    "latency (ns)": 50,
    "throughput (iters/s)": 5.0e+07
   }

This leads to a question: some targets use an odd header, hp-timing.h. What units
does it use? Or is it completely undefined (and could potentially change between
GLIBC versions)?


ChangeLog:
2017-08-16  Wilco Dijkstra  <wdijkstr@arm.com>
  
        * benchtests/bench-skeleton.c (main): Add support for
        latency benchmarking.
        * benchtests/scripts/bench.py: Add support for latency benchmarking.
--

diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 3c6dad705594ac0a53edcb4e09686252c13127cf..48287be93b432b3acfc2431d1f7959bd00815b3b 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -71,8 +71,10 @@ main (int argc, char **argv)
       bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+      timing_t throughput = 0, latency = 0;
       int64_t c = 0;
       uint64_t cur;
+      BENCH_VARS;
       while (1)
 	{
 	  if (is_bench)
@@ -86,7 +88,16 @@ main (int argc, char **argv)
 		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
 	      TIMING_DIFF (cur, start, end);
-	      TIMING_ACCUM (total, cur);
+	      TIMING_ACCUM (throughput, cur);
+
+	      TIMING_NOW (start);
+	      for (k = 0; k < iters; k++)
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC_LAT (v, i);
+	      TIMING_NOW (end);
+	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (latency, cur);
+
 	      d_total_i += iters * NUM_SAMPLES (v);
 	    }
 	  else
@@ -131,12 +142,18 @@ main (int argc, char **argv)
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
-      json_attr_double (&json_ctx, "duration", d_total_s);
-      json_attr_double (&json_ctx, "iterations", d_total_i);
       if (is_bench)
-	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+	{
+	  json_attr_double (&json_ctx, "reciprocal throughput (ns)",
+			    throughput / d_total_i);
+	  json_attr_double (&json_ctx, "latency (ns)", latency / d_total_i);
+	  json_attr_double (&json_ctx, "throughput (iters/s)",
+			    d_total_i / throughput * 1000000000.0);
+	}
       else
 	{
+	  json_attr_double (&json_ctx, "duration", d_total_s);
+	  json_attr_double (&json_ctx, "iterations", d_total_i);
 	  json_attr_double (&json_ctx, "max", max / d_iters);
 	  json_attr_double (&json_ctx, "min", min / d_iters);
 	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py
index 8c1c9eeb2bc67a16cb8a8e010fd2b8a2ef8ab6df..b7ccb7c8c2bf1822202a2377dfb0675516115cc5 100755
--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py
@@ -45,7 +45,7 @@ DEFINES_TEMPLATE = '''
 # variant is represented by the _VARIANT structure.  The ARGS structure
 # represents a single set of arguments.
 STRUCT_TEMPLATE = '''
-#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s)
 
 struct args
 {
@@ -84,7 +84,9 @@ EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
 #define RESULT_ACCUM(r, v, i, old, new) \\
         ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
-#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );})
+#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);})
+#define BENCH_VARS %(defvar)s
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
 
@@ -122,17 +124,22 @@ def gen_source(func, directives, all_vals):
     # If we have a return value from the function, make sure it is
     # assigned to prevent the compiler from optimizing out the
     # call.
+    getret = ''
+    latarg = ''
+    defvar = ''
+
     if directives['ret']:
         print('static %s volatile ret;' % directives['ret'])
-        getret = 'ret = '
-    else:
-        getret = ''
+        print('static %s zero __attribute__((used)) = 0;' % directives['ret'])
+        getret = 'ret = func_res = '
+        latarg = 'func_res * zero +'
+        defvar = '%s func_res = 0;' % directives['ret']
 
     # Test initialization.
     if directives['init']:
         print('#define BENCH_INIT %s' % directives['init'])
 
-    print(EPILOGUE % {'getret': getret, 'func': func})
+    print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar })
 
 
 def _print_arg_data(func, directives, all_vals):
    


* Re: [PATCH] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-16 21:01 UTC
  To: Wilco Dijkstra, Alexander Monakov, Arjan van de Ven; +Cc: libc-alpha, nd

On Wednesday 16 August 2017 08:52 PM, Wilco Dijkstra wrote:
> Alexander Monakov wrote:
>> I suggest using "reciprocal throughput" if you're looking for a short term
>> for 'independent executions per unit time'.  It's easier to recognize and
>> already used in practice (e.g. in docs by Agner Fog).
> 
> Right what about this?
> 
>    "workload-spec2006.wrf": {
>     "reciprocal throughput (ns)": 20,
>     "latency (ns)": 50,
>     "throughput (iters/s)": 5.0e+07

Please drop the spaces and special chars in the key names; they need to
validate against benchout.schema and IIRC it doesn't allow spaces.
Simple reciprocal-throughput, latency, etc. ought to be sufficient.

>    }
> 
> This leads to a question, some targets use an odd header hp-timing.h. What units
> does this use? Or is it completely undefined (and could potentially change between
> GLIBC versions)?

These vary between architectures and may change between glibc versions,
for example if an architecture later adds a hardware time-source read.

Siddhesh


* Re: [PATCH v3] Add math benchmark latency test
From: Wilco Dijkstra @ 2017-08-17 13:08 UTC
  To: Siddhesh Poyarekar, Alexander Monakov, Arjan van de Ven; +Cc: libc-alpha, nd

OK, here is version 3:

This patch further improves math function benchmarking by adding a latency
test in addition to throughput.  This enables more accurate comparisons of the
math functions. The latency test works by creating a dependency on the previous
iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids
changing the input.

It reports reciprocal throughput and latency in nanoseconds (the exact unit
depends on the timing header used) and max/min throughput in iterations per second:

   "workload-spec2006.wrf": {
    "reciprocal-throughput": 100,
    "latency": 200,
    "max-throughput": 1.0e+07,
    "min-throughput": 5.0e+06
   }
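
(The two throughput figures are simply reciprocals of the per-call times:
assuming nanosecond timing, max-throughput = 1e9 / reciprocal-throughput
= 1e9 / 100 = 1.0e+07 calls/s, and min-throughput = 1e9 / latency
= 1e9 / 200 = 5.0e+06 calls/s.)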

OK for commit?

ChangeLog:
2017-08-17  Wilco Dijkstra  <wdijkstr@arm.com>
  
        * benchtests/bench-skeleton.c (main): Add support for
        latency benchmarking.
        * benchtests/scripts/bench.py: Add support for latency benchmarking.
--

diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 3c6dad705594ac0a53edcb4e09686252c13127cf..955b2e1d2125775c6ffc10a004184a9564496e46 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -71,8 +71,10 @@ main (int argc, char **argv)
       bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
       double d_total_i = 0;
       timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+      timing_t throughput = 0, latency = 0;
       int64_t c = 0;
       uint64_t cur;
+      BENCH_VARS;
       while (1)
 	{
 	  if (is_bench)
@@ -86,7 +88,16 @@ main (int argc, char **argv)
 		  BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
 	      TIMING_DIFF (cur, start, end);
-	      TIMING_ACCUM (total, cur);
+	      TIMING_ACCUM (throughput, cur);
+
+	      TIMING_NOW (start);
+	      for (k = 0; k < iters; k++)
+		for (i = 0; i < NUM_SAMPLES (v); i++)
+		  BENCH_FUNC_LAT (v, i);
+	      TIMING_NOW (end);
+	      TIMING_DIFF (cur, start, end);
+	      TIMING_ACCUM (latency, cur);
+
 	      d_total_i += iters * NUM_SAMPLES (v);
 	    }
 	  else
@@ -131,12 +142,20 @@ main (int argc, char **argv)
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
-      json_attr_double (&json_ctx, "duration", d_total_s);
-      json_attr_double (&json_ctx, "iterations", d_total_i);
       if (is_bench)
-	json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i);
+	{
+	  json_attr_double (&json_ctx, "reciprocal-throughput",
+			    throughput / d_total_i);
+	  json_attr_double (&json_ctx, "latency", latency / d_total_i);
+	  json_attr_double (&json_ctx, "max-throughput",
+			    d_total_i / throughput * 1000000000.0);
+	  json_attr_double (&json_ctx, "min-throughput",
+			    d_total_i / latency * 1000000000.0);
+	}
       else
 	{
+	  json_attr_double (&json_ctx, "duration", d_total_s);
+	  json_attr_double (&json_ctx, "iterations", d_total_i);
 	  json_attr_double (&json_ctx, "max", max / d_iters);
 	  json_attr_double (&json_ctx, "min", min / d_iters);
 	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py
index 8c1c9eeb2bc67a16cb8a8e010fd2b8a2ef8ab6df..b7ccb7c8c2bf1822202a2377dfb0675516115cc5 100755
--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py
@@ -45,7 +45,7 @@ DEFINES_TEMPLATE = '''
 # variant is represented by the _VARIANT structure.  The ARGS structure
 # represents a single set of arguments.
 STRUCT_TEMPLATE = '''
-#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s)
 
 struct args
 {
@@ -84,7 +84,9 @@ EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
 #define RESULT_ACCUM(r, v, i, old, new) \\
         ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
-#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );})
+#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);})
+#define BENCH_VARS %(defvar)s
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
 
@@ -122,17 +124,22 @@ def gen_source(func, directives, all_vals):
     # If we have a return value from the function, make sure it is
     # assigned to prevent the compiler from optimizing out the
     # call.
+    getret = ''
+    latarg = ''
+    defvar = ''
+
     if directives['ret']:
         print('static %s volatile ret;' % directives['ret'])
-        getret = 'ret = '
-    else:
-        getret = ''
+        print('static %s zero __attribute__((used)) = 0;' % directives['ret'])
+        getret = 'ret = func_res = '
+        latarg = 'func_res * zero +'
+        defvar = '%s func_res = 0;' % directives['ret']
 
     # Test initialization.
     if directives['init']:
         print('#define BENCH_INIT %s' % directives['init'])
 
-    print(EPILOGUE % {'getret': getret, 'func': func})
+    print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar })
 
 
 def _print_arg_data(func, directives, all_vals):


* Re: [PATCH v3] Add math benchmark latency test
From: Siddhesh Poyarekar @ 2017-08-17 14:01 UTC
  To: Wilco Dijkstra, Alexander Monakov, Arjan van de Ven; +Cc: libc-alpha, nd

On Thursday 17 August 2017 06:38 PM, Wilco Dijkstra wrote:
> OK, here is version 3:
> 
> This patch further improves math function benchmarking by adding a latency
> test in addition to throughput.  This enables more accurate comparisons of the
> math functions. The latency test works by creating a dependency on the previous
> iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids
> changing the input.
> 
> It reports reciprocal throughput and latency in nanoseconds (depending on the
> timing header used) and max/min throughput in iterations per second:
> 
>    "workload-spec2006.wrf": {
>     "reciprocal-throughput": 100,
>     "latency": 200,
>     "max-throughput": 1.0e+07,
>     "min-throughput": 5.0e+06
>    }

This looks good, please commit.

Siddhesh


* Re: [PATCH] Add math benchmark latency test
From: Wilco Dijkstra @ 2017-08-17 15:46 UTC
  To: Siddhesh Poyarekar, Szabolcs Nagy, libc-alpha; +Cc: nd

Siddhesh Poyarekar wrote:
>
> BTW, this will only work for functions that have the same first argument
> and return types.  This is OK for now, but it might be something to fix
> in future.  A comment to that effect in the code would be very useful
> because I'm pretty sure I'll forget about it in the next 15 minutes or
> so and Wilco will too, maybe later than that :)

I've added a comment. It would fail on, say, pointer + float, but works fine
if the types support addition between them. The latency test doesn't work
for sincos (somehow the madd is left out), but I'll fix that when I add a
workload for it.
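
To illustrate (the exact argument expansion is generated by bench.py, so this
is only a sketch), for powf the latency call expands to something like:

  ret = func_res = powf (func_res * zero + variants[v].in[i].arg0,
                         variants[v].in[i].arg1);

so the requirement is that func_res (the return type) can be multiplied by
zero and added to the first argument's type.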

Wilco
