augmented_raw_syscalls.c 8.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// SPDX-License-Identifier: GPL-2.0
/*
 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
 *
 * Test it with:
 *
 * perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
 *
 * This exactly matches what is marshalled into the raw_syscall:sys_enter
 * payload expected by the 'perf trace' beautifiers.
 *
 * For now it just uses the existing tracepoint augmentation code in 'perf
 * trace', in the next csets we'll hook up these with the sys_enter/sys_exit
 * code that will combine entry/exit in a strace like way.
 */

17
#include <unistd.h>
18
#include <pid_filter.h>
19 20

/* bpf-output associated map */
21
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
22

23 24 25 26
struct syscall {
	bool	enabled;
};

27
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
28

29 30 31 32 33 34 35 36 37 38 39 40
struct syscall_enter_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	unsigned long	   args[6];
};

struct syscall_exit_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	long		   ret;
};

41 42 43 44 45 46
struct augmented_filename {
	unsigned int	size;
	int		reserved;
	char		value[256];
};

47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
/* syscalls where the first arg is a string */
#define SYS_OPEN                 2
#define SYS_STAT                 4
#define SYS_LSTAT                6
#define SYS_ACCESS              21
#define SYS_EXECVE              59
#define SYS_TRUNCATE            76
#define SYS_CHDIR               80
#define SYS_RENAME              82
#define SYS_MKDIR               83
#define SYS_RMDIR               84
#define SYS_CREAT               85
#define SYS_LINK                86
#define SYS_UNLINK              87
#define SYS_SYMLINK             88
#define SYS_READLINK            89
#define SYS_CHMOD               90
#define SYS_CHOWN               92
#define SYS_LCHOWN              94
#define SYS_MKNOD              133
#define SYS_STATFS             137
#define SYS_PIVOT_ROOT         155
#define SYS_CHROOT             161
#define SYS_ACCT               163
#define SYS_SWAPON             167
#define SYS_SWAPOFF            168
#define SYS_DELETE_MODULE      176
#define SYS_SETXATTR           188
#define SYS_LSETXATTR          189
#define SYS_GETXATTR           191
#define SYS_LGETXATTR          192
#define SYS_LISTXATTR          194
#define SYS_LLISTXATTR         195
#define SYS_REMOVEXATTR        197
#define SYS_LREMOVEXATTR       198
#define SYS_MQ_OPEN            240
#define SYS_MQ_UNLINK          241
#define SYS_ADD_KEY            248
#define SYS_REQUEST_KEY        249
#define SYS_SYMLINKAT          266
#define SYS_MEMFD_CREATE       319

/* syscalls where the first arg is a string */

#define SYS_PWRITE64            18
#define SYS_EXECVE              59
#define SYS_RENAME              82
#define SYS_QUOTACTL           179
#define SYS_FSETXATTR          190
#define SYS_FGETXATTR          193
#define SYS_FREMOVEXATTR       199
#define SYS_MQ_TIMEDSEND       242
#define SYS_REQUEST_KEY        249
#define SYS_INOTIFY_ADD_WATCH  254
#define SYS_OPENAT             257
#define SYS_MKDIRAT            258
#define SYS_MKNODAT            259
#define SYS_FCHOWNAT           260
#define SYS_FUTIMESAT          261
#define SYS_NEWFSTATAT         262
#define SYS_UNLINKAT           263
#define SYS_RENAMEAT           264
#define SYS_LINKAT             265
#define SYS_READLINKAT         267
#define SYS_FCHMODAT           268
#define SYS_FACCESSAT          269
#define SYS_UTIMENSAT          280
#define SYS_NAME_TO_HANDLE_AT  303
#define SYS_FINIT_MODULE       313
#define SYS_RENAMEAT2          316
#define SYS_EXECVEAT           322
#define SYS_STATX              332
119

120 121
pid_filter(pids_filtered);

122 123 124 125 126
SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
	struct {
		struct syscall_enter_args args;
127
		struct augmented_filename filename;
128
	} augmented_args;
129
	struct syscall *syscall;
130
	unsigned int len = sizeof(augmented_args);
131
	const void *filename_arg = NULL;
132

133
	if (pid_filter__has(&pids_filtered, getpid()))
134 135
		return 0;

136
	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
137

138 139
	syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr);
	if (syscall == NULL || !syscall->enabled)
140
		return 0;
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
	/*
	 * Yonghong and Edward Cree sayz:
	 *
	 * https://www.spinics.net/lists/netdev/msg531645.html
	 *
	 * >>   R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
	 * >> 10: (bf) r1 = r6
	 * >> 11: (07) r1 += 16
	 * >> 12: (05) goto pc+2
	 * >> 15: (79) r3 = *(u64 *)(r1 +0)
	 * >> dereference of modified ctx ptr R1 off=16 disallowed
	 * > Aha, we at least got a different error message this time.
	 * > And indeed llvm has done that optimisation, rather than the more obvious
	 * > 11: r3 = *(u64 *)(r1 +16)
	 * > because it wants to have lots of reads share a single insn.  You may be able
	 * > to defeat that optimisation by adding compiler barriers, idk.  Maybe someone
	 * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
	 * > when it's generating for bpf backend and not do that).  -O0?  ¯\_(ツ)_/¯
	 *
	 * The optimization mostly likes below:
	 *
	 *	br1:
	 * 	...
	 *	r1 += 16
	 *	goto merge
	 *	br2:
	 *	...
	 *	r1 += 20
	 *	goto merge
	 *	merge:
	 *	*(u64 *)(r1 + 0)
	 *
	 * The compiler tries to merge common loads. There is no easy way to
	 * stop this compiler optimization without turning off a lot of other
	 * optimizations. The easiest way is to add barriers:
	 *
	 * 	 __asm__ __volatile__("": : :"memory")
	 *
	 * 	 after the ctx memory access to prevent their down stream merging.
	 */
181 182 183 184 185 186 187 188 189 190 191 192 193
	/*
	 * This table of what args are strings will be provided by userspace,
	 * in the syscalls map, i.e. we will already have to do the lookup to
	 * see if this specific syscall is filtered, so we can as well get more
	 * info about what syscall args are strings or pointers, and how many
	 * bytes to copy, per arg, etc.
	 *
	 * For now hard code it, till we have all the basic mechanisms in place
	 * to automate everything and make the kernel part be completely driven
	 * by information obtained in userspace for each kernel version and
	 * processor architecture, making the kernel part the same no matter what
	 * kernel version or processor architecture it runs on.
	 */
194
	switch (augmented_args.args.syscall_nr) {
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
	case SYS_ACCT:
	case SYS_ADD_KEY:
	case SYS_CHDIR:
	case SYS_CHMOD:
	case SYS_CHOWN:
	case SYS_CHROOT:
	case SYS_CREAT:
	case SYS_DELETE_MODULE:
	case SYS_EXECVE:
	case SYS_GETXATTR:
	case SYS_LCHOWN:
	case SYS_LGETXATTR:
	case SYS_LINK:
	case SYS_LISTXATTR:
	case SYS_LLISTXATTR:
	case SYS_LREMOVEXATTR:
	case SYS_LSETXATTR:
	case SYS_LSTAT:
	case SYS_MEMFD_CREATE:
	case SYS_MKDIR:
	case SYS_MKNOD:
	case SYS_MQ_OPEN:
	case SYS_MQ_UNLINK:
	case SYS_PIVOT_ROOT:
	case SYS_READLINK:
	case SYS_REMOVEXATTR:
	case SYS_RENAME:
	case SYS_REQUEST_KEY:
	case SYS_RMDIR:
	case SYS_SETXATTR:
	case SYS_STAT:
	case SYS_STATFS:
	case SYS_SWAPOFF:
	case SYS_SWAPON:
	case SYS_SYMLINK:
	case SYS_SYMLINKAT:
	case SYS_TRUNCATE:
	case SYS_UNLINK:
233
	case SYS_ACCESS:
234 235 236
	case SYS_OPEN:	 filename_arg = (const void *)args->args[0];
			__asm__ __volatile__("": : :"memory");
			 break;
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
	case SYS_EXECVEAT:
	case SYS_FACCESSAT:
	case SYS_FCHMODAT:
	case SYS_FCHOWNAT:
	case SYS_FGETXATTR:
	case SYS_FINIT_MODULE:
	case SYS_FREMOVEXATTR:
	case SYS_FSETXATTR:
	case SYS_FUTIMESAT:
	case SYS_INOTIFY_ADD_WATCH:
	case SYS_LINKAT:
	case SYS_MKDIRAT:
	case SYS_MKNODAT:
	case SYS_MQ_TIMEDSEND:
	case SYS_NAME_TO_HANDLE_AT:
	case SYS_NEWFSTATAT:
	case SYS_PWRITE64:
	case SYS_QUOTACTL:
	case SYS_READLINKAT:
	case SYS_RENAMEAT:
	case SYS_RENAMEAT2:
	case SYS_STATX:
	case SYS_UNLINKAT:
	case SYS_UTIMENSAT:
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
	case SYS_OPENAT: filename_arg = (const void *)args->args[1];
			 break;
	}

	if (filename_arg != NULL) {
		augmented_args.filename.reserved = 0;
		augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
							      sizeof(augmented_args.filename.value),
							      filename_arg);
		if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
			len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
			len &= sizeof(augmented_args.filename.value) - 1;
		}
	} else {
		len = sizeof(augmented_args.args);
	}

278 279
	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
280 281 282 283 284
}

SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
285
	struct syscall_exit_args exit_args;
286
	struct syscall *syscall;
287 288 289 290 291 292

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	probe_read(&exit_args, sizeof(exit_args), args);

293 294
	syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
	if (syscall == NULL || !syscall->enabled)
295 296 297
		return 0;

	return 1;
298 299 300
}

license(GPL);