Files
linux/tools/perf/examples/bpf/augmented_raw_syscalls.c
Arnaldo Carvalho de Melo 4cae8675ea perf augmented_raw_syscalls: Tell which args are filenames and how many bytes to copy
Since we know what args are strings from reading the syscall
descriptions in tracefs and also already mark such args to be beautified
using the syscall_arg__scnprintf_filename() helper, all we need is to
fill in this info in the 'syscalls' BPF map we were using to state which
syscalls the user is interested in, i.e. the syscall filter.

Right now just set that with PATH_MAX and unroll the syscall arg in the
BPF program, as the verifier isn't liking something clang generates when
unrolling the loop.

This also makes the augmented_raw_syscalls.c program support all arches,
since we removed that set of defines with the hard coded syscall
numbers, all should be automatically set for all arches, with the
syscall id mapping done correcly.

Doing baby steps here, i.e. just the first string arg for a syscall is
printed, syscalls with more than one, say, the various rename* syscalls,
need further work, but lets get first something that the BPF verifier
accepts before increasing the complexity

To test it, something like:

 # perf trace -e string -e /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c

With:

  # cat ~/.perfconfig
  [llvm]
	dump-obj = true
	clang-opt = -g
  [trace]
	#add_events = /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.c
	show_zeros = yes
	show_duration = no
	no_inherit = yes
	show_timestamp = no
	show_arg_names = no
	args_alignment = 40
	show_prefix = yes
  #

That commented add_events line is needed for developing this
augmented_raw_syscalls.c BPF program, as if we add it via the
'add_events' mechanism so as to shorten the 'perf trace' command lines,
then we end up not setting up the -v option which precludes us having
access to the bpf verifier log :-\

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Link: https://lkml.kernel.org/n/tip-dn863ya0cbsqycxuy0olvbt1@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-06-05 09:47:58 -03:00

220 lines
7.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
*
* Test it with:
*
* perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
*
* This exactly matches what is marshalled into the raw_syscall:sys_enter
* payload expected by the 'perf trace' beautifiers.
*
* For now it just uses the existing tracepoint augmentation code in 'perf
* trace', in the next csets we'll hook up these with the sys_enter/sys_exit
* code that will combine entry/exit in a strace like way.
*/
#include <unistd.h>
#include <linux/limits.h>
#include <pid_filter.h>
/* bpf-output associated map */
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
/*
* string_args_len: one per syscall arg, 0 means not a string or don't copy it,
* PATH_MAX for copying everything, any other value to limit
* it a la 'strace -s strsize'.
*/
struct syscall {
bool enabled;
u16 string_args_len[6];
};
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
struct syscall_enter_args {
unsigned long long common_tp_fields;
long syscall_nr;
unsigned long args[6];
};
struct syscall_exit_args {
unsigned long long common_tp_fields;
long syscall_nr;
long ret;
};
struct augmented_filename {
unsigned int size;
int reserved;
char value[PATH_MAX];
};
pid_filter(pids_filtered);
struct augmented_args_filename {
struct syscall_enter_args args;
struct augmented_filename filename;
};
bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
struct augmented_args_filename *augmented_args;
unsigned int len = sizeof(*augmented_args), filename_len;
const void *filename_arg = NULL;
struct syscall *syscall;
int key = 0;
augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
if (augmented_args == NULL)
return 1;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
if (syscall == NULL || !syscall->enabled)
return 0;
/*
* Yonghong and Edward Cree sayz:
*
* https://www.spinics.net/lists/netdev/msg531645.html
*
* >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
* >> 10: (bf) r1 = r6
* >> 11: (07) r1 += 16
* >> 12: (05) goto pc+2
* >> 15: (79) r3 = *(u64 *)(r1 +0)
* >> dereference of modified ctx ptr R1 off=16 disallowed
* > Aha, we at least got a different error message this time.
* > And indeed llvm has done that optimisation, rather than the more obvious
* > 11: r3 = *(u64 *)(r1 +16)
* > because it wants to have lots of reads share a single insn. You may be able
* > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
* > with llvm knowledge can figure out how to stop it (ideally, llvm would know
* > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
*
* The optimization mostly likes below:
*
* br1:
* ...
* r1 += 16
* goto merge
* br2:
* ...
* r1 += 20
* goto merge
* merge:
* *(u64 *)(r1 + 0)
*
* The compiler tries to merge common loads. There is no easy way to
* stop this compiler optimization without turning off a lot of other
* optimizations. The easiest way is to add barriers:
*
* __asm__ __volatile__("": : :"memory")
*
* after the ctx memory access to prevent their down stream merging.
*/
/*
* For now copy just the first string arg, we need to improve the protocol
* and have more than one.
*
* Using the unrolled loop is not working, only when we do it manually,
* check this out later...
u8 arg;
#pragma clang loop unroll(full)
for (arg = 0; arg < 6; ++arg) {
if (syscall->string_args_len[arg] != 0) {
filename_len = syscall->string_args_len[arg];
filename_arg = (const void *)args->args[arg];
__asm__ __volatile__("": : :"memory");
break;
}
}
verifier log:
; if (syscall->string_args_len[arg] != 0) {
37: (69) r3 = *(u16 *)(r0 +2)
R0=map_value(id=0,off=0,ks=4,vs=14,imm=0) R1_w=inv0 R2_w=map_value(id=0,off=2,ks=4,vs=14,imm=0) R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=4168,imm=0) R10=fp0,call_-1 fp-8=mmmmmmmm
; if (syscall->string_args_len[arg] != 0) {
38: (55) if r3 != 0x0 goto pc+5
R0=map_value(id=0,off=0,ks=4,vs=14,imm=0) R1=inv0 R2=map_value(id=0,off=2,ks=4,vs=14,imm=0) R3=inv0 R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=4168,imm=0) R10=fp0,call_-1 fp-8=mmmmmmmm
39: (b7) r1 = 1
; if (syscall->string_args_len[arg] != 0) {
40: (bf) r2 = r0
41: (07) r2 += 4
42: (69) r3 = *(u16 *)(r0 +4)
R0=map_value(id=0,off=0,ks=4,vs=14,imm=0) R1_w=inv1 R2_w=map_value(id=0,off=4,ks=4,vs=14,imm=0) R3_w=inv0 R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=4168,imm=0) R10=fp0,call_-1 fp-8=mmmmmmmm
; if (syscall->string_args_len[arg] != 0) {
43: (15) if r3 == 0x0 goto pc+32
R0=map_value(id=0,off=0,ks=4,vs=14,imm=0) R1=inv1 R2=map_value(id=0,off=4,ks=4,vs=14,imm=0) R3=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=4168,imm=0) R10=fp0,call_-1 fp-8=mmmmmmmm
; filename_arg = (const void *)args->args[arg];
44: (67) r1 <<= 3
45: (bf) r3 = r6
46: (0f) r3 += r1
47: (b7) r5 = 64
48: (79) r3 = *(u64 *)(r3 +16)
dereference of modified ctx ptr R3 off=8 disallowed
processed 46 insns (limit 1000000) max_states_per_insn 0 total_states 12 peak_states 12 mark_read 7
*/
#define __loop_iter(arg) \
if (syscall->string_args_len[arg] != 0) { \
filename_len = syscall->string_args_len[arg]; \
filename_arg = (const void *)args->args[arg];
#define loop_iter_first() __loop_iter(0); }
#define loop_iter(arg) else __loop_iter(arg); }
#define loop_iter_last(arg) else __loop_iter(arg); __asm__ __volatile__("": : :"memory"); }
loop_iter_first()
loop_iter(1)
loop_iter(2)
loop_iter(3)
loop_iter(4)
loop_iter_last(5)
if (filename_arg != NULL && filename_len <= sizeof(augmented_args->filename.value)) {
augmented_args->filename.reserved = 0;
augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
filename_len,
filename_arg);
if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
len &= sizeof(augmented_args->filename.value) - 1;
}
} else {
len = sizeof(augmented_args->args);
}
/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
}
SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
struct syscall_exit_args exit_args;
struct syscall *syscall;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
probe_read(&exit_args, sizeof(exit_args), args);
syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
if (syscall == NULL || !syscall->enabled)
return 0;
return 1;
}
license(GPL);