mirror of
https://github.com/torvalds/linux.git
synced 2026-04-19 15:24:02 -04:00
We frequently consult with Jesper's out-of-tree page_pool benchmark to
evaluate page_pool changes.
Import the benchmark into the upstream Linux kernel tree so that (a)
we're all running the same version, (b) we pave the way for shared
improvements, and (c) we can maybe one day integrate it with nipa, if possible.
Import bench_page_pool_simple from commit 35b1716d0c30 ("Add
page_bench06_walk_all"), from this repository:
https://github.com/netoptimizer/prototype-kernel.git
Changes done during upstreaming:
- Fix checkpatch issues.
- Remove the tasklet logic that is not needed.
- Move under tools/testing
- Create ksft for the benchmark.
- Changed slightly how the benchmark gets built. Out of tree, time_bench
is built as an independent .ko. Here it is included in
bench_page_pool.ko
Steps to run:
```
mkdir -p /tmp/run-pp-bench
make -C ./tools/testing/selftests/net/bench
make -C ./tools/testing/selftests/net/bench install INSTALL_PATH=/tmp/run-pp-bench
rsync --delete -avz --progress /tmp/run-pp-bench mina@$SERVER:~/
ssh mina@$SERVER << EOF
cd ~/run-pp-bench && sudo ./test_bench_page_pool.sh
EOF
```
Note that by default, the Makefile will build the benchmark for the
currently installed kernel in /lib/modules/$(shell uname -r)/build. To
build against the current tree, do:
make KDIR=$(pwd) -C ./tools/testing/selftests/net/bench
Output (from Jesper):
```
sudo ./test_bench_page_pool.sh
(benchmark dmesg logs snipped)
Fast path results:
no-softirq-page_pool01 Per elem: 23 cycles(tsc) 6.571 ns
ptr_ring results:
no-softirq-page_pool02 Per elem: 60 cycles(tsc) 16.862 ns
slow path results:
no-softirq-page_pool03 Per elem: 265 cycles(tsc) 73.739 ns
```
Output (from me):
```
sudo ./test_bench_page_pool.sh
(benchmark dmesg logs snipped)
Fast path results:
no-softirq-page_pool01 Per elem: 11 cycles(tsc) 4.177 ns
ptr_ring results:
no-softirq-page_pool02 Per elem: 51 cycles(tsc) 19.117 ns
slow path results:
no-softirq-page_pool03 Per elem: 168 cycles(tsc) 62.469 ns
```
Results will of course vary based on hardware/kernel/configs, and some
run-to-run variance is expected due to noise.
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250619181519.3102426-1-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
239 lines
7.1 KiB
C
239 lines
7.1 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Benchmarking code execution time inside the kernel
|
|
*
|
|
* Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
|
|
* for licensing details see kernel-base/COPYING
|
|
*/
|
|
#ifndef _LINUX_TIME_BENCH_H
|
|
#define _LINUX_TIME_BENCH_H
|
|
|
|
/* Main structure used for recording a benchmark run */
|
|
struct time_bench_record {
|
|
uint32_t version_abi;
|
|
uint32_t loops; /* Requested loop invocations */
|
|
uint32_t step; /* option for e.g. bulk invocations */
|
|
|
|
uint32_t flags; /* Measurements types enabled */
|
|
#define TIME_BENCH_LOOP BIT(0)
|
|
#define TIME_BENCH_TSC BIT(1)
|
|
#define TIME_BENCH_WALLCLOCK BIT(2)
|
|
#define TIME_BENCH_PMU BIT(3)
|
|
|
|
uint32_t cpu; /* Used when embedded in time_bench_cpu */
|
|
|
|
/* Records */
|
|
uint64_t invoked_cnt; /* Returned actual invocations */
|
|
uint64_t tsc_start;
|
|
uint64_t tsc_stop;
|
|
struct timespec64 ts_start;
|
|
struct timespec64 ts_stop;
|
|
/* PMU counters for instruction and cycles
|
|
* instructions counter including pipelined instructions
|
|
*/
|
|
uint64_t pmc_inst_start;
|
|
uint64_t pmc_inst_stop;
|
|
/* CPU unhalted clock counter */
|
|
uint64_t pmc_clk_start;
|
|
uint64_t pmc_clk_stop;
|
|
|
|
/* Result records */
|
|
uint64_t tsc_interval;
|
|
uint64_t time_start, time_stop, time_interval; /* in nanosec */
|
|
uint64_t pmc_inst, pmc_clk;
|
|
|
|
/* Derived result records */
|
|
uint64_t tsc_cycles; // +decimal?
|
|
uint64_t ns_per_call_quotient, ns_per_call_decimal;
|
|
uint64_t time_sec;
|
|
uint32_t time_sec_remainder;
|
|
uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */
|
|
};
|
|
|
|
/* For synchronizing parallel CPUs to run concurrently */
|
|
struct time_bench_sync {
|
|
atomic_t nr_tests_running;
|
|
struct completion start_event;
|
|
};
|
|
|
|
/* Keep track of CPUs executing our bench function.
|
|
*
|
|
* Embed a time_bench_record for storing info per cpu
|
|
*/
|
|
struct time_bench_cpu {
|
|
struct time_bench_record rec;
|
|
struct time_bench_sync *sync; /* back ptr */
|
|
struct task_struct *task;
|
|
/* "data" opaque could have been placed in time_bench_sync,
|
|
* but to avoid any false sharing, place it per CPU
|
|
*/
|
|
void *data;
|
|
/* Support masking outsome CPUs, mark if it ran */
|
|
bool did_bench_run;
|
|
/* int cpu; // note CPU stored in time_bench_record */
|
|
int (*bench_func)(struct time_bench_record *record, void *data);
|
|
};
|
|
|
|
/*
|
|
* Below TSC assembler code is not compatible with other archs, and
|
|
* can also fail on guests if cpu-flags are not correct.
|
|
*
|
|
* The way TSC reading is used, many iterations, does not require as
|
|
* high accuracy as described below (in Intel Doc #324264).
|
|
*
|
|
* Considering changing to use get_cycles() (#include <asm/timex.h>).
|
|
*/
|
|
|
|
/** TSC (Time-Stamp Counter) based **
|
|
* Recommend reading, to understand details of reading TSC accurately:
|
|
* Intel Doc #324264, "How to Benchmark Code Execution Times on Intel"
|
|
*
|
|
* Consider getting exclusive ownership of CPU by using:
|
|
* unsigned long flags;
|
|
* preempt_disable();
|
|
* raw_local_irq_save(flags);
|
|
* _your_code_
|
|
* raw_local_irq_restore(flags);
|
|
* preempt_enable();
|
|
*
|
|
* Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx"
|
|
* RDTSC only change "%rax" and "%rdx" but
|
|
* CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx)
|
|
*/
|
|
static __always_inline uint64_t tsc_start_clock(void)
|
|
{
|
|
/* See: Intel Doc #324264 */
|
|
unsigned int hi, lo;
|
|
|
|
asm volatile("CPUID\n\t"
|
|
"RDTSC\n\t"
|
|
"mov %%edx, %0\n\t"
|
|
"mov %%eax, %1\n\t"
|
|
: "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx");
|
|
//FIXME: on 32bit use clobbered %eax + %edx
|
|
return ((uint64_t)lo) | (((uint64_t)hi) << 32);
|
|
}
|
|
|
|
static __always_inline uint64_t tsc_stop_clock(void)
|
|
{
|
|
/* See: Intel Doc #324264 */
|
|
unsigned int hi, lo;
|
|
|
|
asm volatile("RDTSCP\n\t"
|
|
"mov %%edx, %0\n\t"
|
|
"mov %%eax, %1\n\t"
|
|
"CPUID\n\t"
|
|
: "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx");
|
|
return ((uint64_t)lo) | (((uint64_t)hi) << 32);
|
|
}
|
|
|
|
/** Wall-clock based **
|
|
*
|
|
* use: getnstimeofday()
|
|
* getnstimeofday(&rec->ts_start);
|
|
* getnstimeofday(&rec->ts_stop);
|
|
*
|
|
* API changed see: Documentation/core-api/timekeeping.rst
|
|
* https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstimeofday
|
|
*
|
|
* We should instead use: ktime_get_real_ts64() is a direct
|
|
* replacement, but consider using monotonic time (ktime_get_ts64())
|
|
* and/or a ktime_t based interface (ktime_get()/ktime_get_real()).
|
|
*/
|
|
|
|
/** PMU (Performance Monitor Unit) based **
 *
 * Needed for calculating: Instructions Per Cycle (IPC)
 * - The IPC number tells how efficient the CPU pipelining was
 */
/* lookup: perf_event_create_kernel_counter() */

/*
 * Defined outside this header.
 * NOTE(review): presumably switches the PMU counters on/off according to
 * @enable and reports success -- confirm against the implementation.
 */
bool time_bench_PMU_config(bool enable);
|
|
|
|
/*
 * Raw reading via rdpmc() using fixed counters
 *
 * From: https://github.com/andikleen/simple-pmu
 *
 * FIXED_SELECT is OR'ed with one of the FIXED_* counter indexes to build
 * the selector handed to p_rdpmc() (see pmc_inst() and pmc_clk()).
 */
enum {
	FIXED_INST_RETIRED_ANY		= 0,
	FIXED_CPU_CLK_UNHALTED_CORE	= 1,
	FIXED_CPU_CLK_UNHALTED_REF	= 2,
	FIXED_SELECT			= 0x40000000, /* == (1U << 30) */
};
|
|
|
|
static __always_inline unsigned int long long p_rdpmc(unsigned int in)
|
|
{
|
|
unsigned int d, a;
|
|
|
|
asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory");
|
|
return ((unsigned long long)d << 32) | a;
|
|
}
|
|
|
|
/* These PMU counter needs to be enabled, but I don't have the
|
|
* configure code implemented. My current hack is running:
|
|
* sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko
|
|
*/
|
|
/* Reading all pipelined instruction */
|
|
static __always_inline unsigned long long pmc_inst(void)
|
|
{
|
|
return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY);
|
|
}
|
|
|
|
/* Reading CPU clock cycles */
|
|
static __always_inline unsigned long long pmc_clk(void)
|
|
{
|
|
return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE);
|
|
}
|
|
|
|
/*
 * Raw reading via MSR rdmsr() is likely wrong
 * FIXME: How can I know which raw MSR registers are conf for what?
 */
#define MSR_IA32_PCM0	0x400000C1 /* PERFCTR0 */
#define MSR_IA32_PCM1	0x400000C2 /* PERFCTR1 */
#define MSR_IA32_PCM2	0x400000C3

/*
 * Read MSR_IA32_PCM0 into @msr_result using rdmsrq_safe() and hand back
 * whatever rdmsrq_safe() returned.
 *
 * NOTE(review): rdmsrq_safe() presumably returns an int status (0 or a
 * negative errno); it is converted to uint64_t here -- confirm that
 * callers only compare the result against zero.
 */
static inline uint64_t msr_inst(unsigned long long *msr_result)
{
	uint64_t status = rdmsrq_safe(MSR_IA32_PCM0, msr_result);

	return status;
}
|
|
|
|
/** Generic functions **
 *
 * Declarations only; the definitions are not in this header.
 */
bool time_bench_loop(uint32_t loops, int step, char *txt, void *data,
		     int (*func)(struct time_bench_record *rec, void *data));

bool time_bench_calc_stats(struct time_bench_record *rec);

/* @mask: supports masking out some CPUs */
void time_bench_run_concurrent(uint32_t loops, int step, void *data,
			       const struct cpumask *mask,
			       struct time_bench_sync *sync,
			       struct time_bench_cpu *cpu_tasks,
			       int (*func)(struct time_bench_record *record,
					   void *data));

void time_bench_print_stats_cpumask(const char *desc,
				    struct time_bench_cpu *cpu_tasks,
				    const struct cpumask *mask);
|
|
|
|
//FIXME: use rec->flags to select measurement, should be MACRO
|
|
static __always_inline void time_bench_start(struct time_bench_record *rec)
|
|
{
|
|
//getnstimeofday(&rec->ts_start);
|
|
ktime_get_real_ts64(&rec->ts_start);
|
|
if (rec->flags & TIME_BENCH_PMU) {
|
|
rec->pmc_inst_start = pmc_inst();
|
|
rec->pmc_clk_start = pmc_clk();
|
|
}
|
|
rec->tsc_start = tsc_start_clock();
|
|
}
|
|
|
|
static __always_inline void time_bench_stop(struct time_bench_record *rec,
|
|
uint64_t invoked_cnt)
|
|
{
|
|
rec->tsc_stop = tsc_stop_clock();
|
|
if (rec->flags & TIME_BENCH_PMU) {
|
|
rec->pmc_inst_stop = pmc_inst();
|
|
rec->pmc_clk_stop = pmc_clk();
|
|
}
|
|
//getnstimeofday(&rec->ts_stop);
|
|
ktime_get_real_ts64(&rec->ts_stop);
|
|
rec->invoked_cnt = invoked_cnt;
|
|
}
|
|
|
|
#endif /* _LINUX_TIME_BENCH_H */
|