mirror of
				https://sourceware.org/git/glibc.git
				synced 2025-11-03 20:53:13 +03:00 
			
		
		
		
	This patch further improves math function benchmarking by adding a latency
test in addition to throughput.  This enables more accurate comparisons of the
math functions.  The latency test works by creating a dependency on the previous
iteration: func_res = F (func_res * zero + input[i]).  Multiplying the previous
result by zero leaves the input value unchanged while still making each call
depend on the result of the call before it.
It reports reciprocal throughput and latency in nanoseconds (depending on the
timing header used) and max/min throughput in iterations per second:
   "workload-spec2006.wrf": {
    "reciprocal-throughput": 100,
    "latency": 200,
    "max-throughput": 1.0e+07,
    "min-throughput": 5.0e+06
   }
	* benchtests/bench-skeleton.c (main): Add support for
	latency benchmarking.
	* benchtests/scripts/bench.py: Add support for latency benchmarking.
		
	
		
			
				
	
	
		
			183 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			183 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* Skeleton for benchmark programs.
 | 
						|
   Copyright (C) 2013-2017 Free Software Foundation, Inc.
 | 
						|
   This file is part of the GNU C Library.
 | 
						|
 | 
						|
   The GNU C Library is free software; you can redistribute it and/or
 | 
						|
   modify it under the terms of the GNU Lesser General Public
 | 
						|
   License as published by the Free Software Foundation; either
 | 
						|
   version 2.1 of the License, or (at your option) any later version.
 | 
						|
 | 
						|
   The GNU C Library is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
						|
   Lesser General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU Lesser General Public
 | 
						|
   License along with the GNU C Library; if not, see
 | 
						|
   <http://www.gnu.org/licenses/>.  */
 | 
						|
 | 
						|
#include <string.h>
 | 
						|
#include <stdint.h>
 | 
						|
#include <stdbool.h>
 | 
						|
#include <stdio.h>
 | 
						|
#include <time.h>
 | 
						|
#include <inttypes.h>
 | 
						|
#include "bench-timing.h"
 | 
						|
#include "json-lib.h"
 | 
						|
#include "bench-util.h"
 | 
						|
 | 
						|
#include "bench-util.c"
 | 
						|
 | 
						|
/* Evaluate to nonzero when timespec A is strictly later than timespec B.
   The seconds fields decide unless they are equal, in which case the
   nanoseconds fields are compared.  Both arguments are evaluated more than
   once, so they must not have side effects.  */
#define TIMESPEC_AFTER(a, b) \
  (((a).tv_sec == (b).tv_sec)						      \
   ? ((a).tv_nsec > (b).tv_nsec)					      \
   : ((a).tv_sec > (b).tv_sec))
 | 
						|
int
 | 
						|
main (int argc, char **argv)
 | 
						|
{
 | 
						|
  unsigned long i, k;
 | 
						|
  struct timespec runtime;
 | 
						|
  timing_t start, end;
 | 
						|
  bool detailed = false;
 | 
						|
  json_ctx_t json_ctx;
 | 
						|
 | 
						|
  if (argc == 2 && !strcmp (argv[1], "-d"))
 | 
						|
    detailed = true;
 | 
						|
 | 
						|
  bench_start ();
 | 
						|
 | 
						|
  memset (&runtime, 0, sizeof (runtime));
 | 
						|
 | 
						|
  unsigned long iters, res;
 | 
						|
 | 
						|
#ifdef BENCH_INIT
 | 
						|
  BENCH_INIT ();
 | 
						|
#endif
 | 
						|
  TIMING_INIT (res);
 | 
						|
 | 
						|
  iters = 1000 * res;
 | 
						|
 | 
						|
  json_init (&json_ctx, 2, stdout);
 | 
						|
 | 
						|
  /* Begin function.  */
 | 
						|
  json_attr_object_begin (&json_ctx, FUNCNAME);
 | 
						|
 | 
						|
  for (int v = 0; v < NUM_VARIANTS; v++)
 | 
						|
    {
 | 
						|
      /* Run for approximately DURATION seconds.  */
 | 
						|
      clock_gettime (CLOCK_MONOTONIC_RAW, &runtime);
 | 
						|
      runtime.tv_sec += DURATION;
 | 
						|
 | 
						|
      bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0;
 | 
						|
      double d_total_i = 0;
 | 
						|
      timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
 | 
						|
      timing_t throughput = 0, latency = 0;
 | 
						|
      int64_t c = 0;
 | 
						|
      uint64_t cur;
 | 
						|
      BENCH_VARS;
 | 
						|
      while (1)
 | 
						|
	{
 | 
						|
	  if (is_bench)
 | 
						|
	    {
 | 
						|
	      /* Benchmark a real trace of calls - all samples are iterated
 | 
						|
		 over once before repeating.  This models actual use more
 | 
						|
		 accurately than repeating the same sample many times.  */
 | 
						|
	      TIMING_NOW (start);
 | 
						|
	      for (k = 0; k < iters; k++)
 | 
						|
		for (i = 0; i < NUM_SAMPLES (v); i++)
 | 
						|
		  BENCH_FUNC (v, i);
 | 
						|
	      TIMING_NOW (end);
 | 
						|
	      TIMING_DIFF (cur, start, end);
 | 
						|
	      TIMING_ACCUM (throughput, cur);
 | 
						|
 | 
						|
	      TIMING_NOW (start);
 | 
						|
	      for (k = 0; k < iters; k++)
 | 
						|
		for (i = 0; i < NUM_SAMPLES (v); i++)
 | 
						|
		  BENCH_FUNC_LAT (v, i);
 | 
						|
	      TIMING_NOW (end);
 | 
						|
	      TIMING_DIFF (cur, start, end);
 | 
						|
	      TIMING_ACCUM (latency, cur);
 | 
						|
 | 
						|
	      d_total_i += iters * NUM_SAMPLES (v);
 | 
						|
	    }
 | 
						|
	  else
 | 
						|
	    for (i = 0; i < NUM_SAMPLES (v); i++)
 | 
						|
	      {
 | 
						|
		TIMING_NOW (start);
 | 
						|
		for (k = 0; k < iters; k++)
 | 
						|
		  BENCH_FUNC (v, i);
 | 
						|
		TIMING_NOW (end);
 | 
						|
 | 
						|
		TIMING_DIFF (cur, start, end);
 | 
						|
 | 
						|
		if (cur > max)
 | 
						|
		  max = cur;
 | 
						|
 | 
						|
		if (cur < min)
 | 
						|
		  min = cur;
 | 
						|
 | 
						|
		TIMING_ACCUM (total, cur);
 | 
						|
		/* Accumulate timings for the value.  In the end we will divide
 | 
						|
		   by the total iterations.  */
 | 
						|
		RESULT_ACCUM (cur, v, i, c * iters, (c + 1) * iters);
 | 
						|
 | 
						|
		d_total_i += iters;
 | 
						|
	      }
 | 
						|
	  c++;
 | 
						|
	  struct timespec curtime;
 | 
						|
 | 
						|
	  memset (&curtime, 0, sizeof (curtime));
 | 
						|
	  clock_gettime (CLOCK_MONOTONIC_RAW, &curtime);
 | 
						|
	  if (TIMESPEC_AFTER (curtime, runtime))
 | 
						|
	    goto done;
 | 
						|
	}
 | 
						|
 | 
						|
      double d_total_s;
 | 
						|
      double d_iters;
 | 
						|
 | 
						|
    done:
 | 
						|
      d_total_s = total;
 | 
						|
      d_iters = iters;
 | 
						|
 | 
						|
      /* Begin variant.  */
 | 
						|
      json_attr_object_begin (&json_ctx, VARIANT (v));
 | 
						|
 | 
						|
      if (is_bench)
 | 
						|
	{
 | 
						|
	  json_attr_double (&json_ctx, "reciprocal-throughput",
 | 
						|
			    throughput / d_total_i);
 | 
						|
	  json_attr_double (&json_ctx, "latency", latency / d_total_i);
 | 
						|
	  json_attr_double (&json_ctx, "max-throughput",
 | 
						|
			    d_total_i / throughput * 1000000000.0);
 | 
						|
	  json_attr_double (&json_ctx, "min-throughput",
 | 
						|
			    d_total_i / latency * 1000000000.0);
 | 
						|
	}
 | 
						|
      else
 | 
						|
	{
 | 
						|
	  json_attr_double (&json_ctx, "duration", d_total_s);
 | 
						|
	  json_attr_double (&json_ctx, "iterations", d_total_i);
 | 
						|
	  json_attr_double (&json_ctx, "max", max / d_iters);
 | 
						|
	  json_attr_double (&json_ctx, "min", min / d_iters);
 | 
						|
	  json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
 | 
						|
	}
 | 
						|
 | 
						|
      if (detailed && !is_bench)
 | 
						|
	{
 | 
						|
	  json_array_begin (&json_ctx, "timings");
 | 
						|
 | 
						|
	  for (int i = 0; i < NUM_SAMPLES (v); i++)
 | 
						|
	    json_element_double (&json_ctx, RESULT (v, i));
 | 
						|
 | 
						|
	  json_array_end (&json_ctx);
 | 
						|
	}
 | 
						|
 | 
						|
      /* End variant.  */
 | 
						|
      json_attr_object_end (&json_ctx);
 | 
						|
    }
 | 
						|
 | 
						|
  /* End function.  */
 | 
						|
  json_attr_object_end (&json_ctx);
 | 
						|
 | 
						|
  return 0;
 | 
						|
}
 |