/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>

/*
 * User-space ABI bits:
 */

/*
 * attr.type
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE			= 0,
	PERF_TYPE_SOFTWARE			= 1,
	PERF_TYPE_TRACEPOINT			= 2,
	PERF_TYPE_HW_CACHE			= 3,
	PERF_TYPE_RAW				= 4,
	PERF_TYPE_BREAKPOINT			= 5,

	PERF_TYPE_MAX,				/* non-ABI */
};

/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open()
 * syscall:
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,
	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
	PERF_COUNT_HW_REF_CPU_CYCLES		= 9,

	PERF_COUNT_HW_MAX,			/* non-ABI */
};

/*
 * Generalized hardware cache events:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D			= 0,
	PERF_COUNT_HW_CACHE_L1I			= 1,
	PERF_COUNT_HW_CACHE_LL			= 2,
	PERF_COUNT_HW_CACHE_DTLB		= 3,
	PERF_COUNT_HW_CACHE_ITLB		= 4,
	PERF_COUNT_HW_CACHE_BPU			= 5,
	PERF_COUNT_HW_CACHE_NODE		= 6,

	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ		= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,

	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};

/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and software events of the kernel (and allow the profiling
 * of them as well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK			= 0,
	PERF_COUNT_SW_TASK_CLOCK		= 1,
	PERF_COUNT_SW_PAGE_FAULTS		= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
	PERF_COUNT_SW_EMULATION_FAULTS		= 8,

	PERF_COUNT_SW_MAX,			/* non-ABI */
};

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 */
enum perf_event_sample_format {
	PERF_SAMPLE_IP				= 1U << 0,
	PERF_SAMPLE_TID				= 1U << 1,
	PERF_SAMPLE_TIME			= 1U << 2,
	PERF_SAMPLE_ADDR			= 1U << 3,
	PERF_SAMPLE_READ			= 1U << 4,
	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
	PERF_SAMPLE_ID				= 1U << 6,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_RAW				= 1U << 10,
	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
	PERF_SAMPLE_REGS_USER			= 1U << 12,
	PERF_SAMPLE_STACK_USER			= 1U << 13,

	PERF_SAMPLE_MAX = 1U << 14,		/* non-ABI */
};

/*
 * values to program into branch_sample_type when PERF_SAMPLE_BRANCH_STACK is set
 *
 * If the user does not pass priv level information via branch_sample_type,
 * the kernel uses the event's priv level. Branch and event priv levels do
 * not have to match. Branch priv level is checked for permissions.
 *
 * The branch types can be combined, however BRANCH_ANY covers all types
 * of branches and therefore it supersedes all the other types.
 */
enum perf_branch_sample_type {
	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */

	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */

	PERF_SAMPLE_BRANCH_MAX		= 1U << 7, /* non-ABI */
};

#define PERF_SAMPLE_BRANCH_PLM_ALL \
	(PERF_SAMPLE_BRANCH_USER|\
	 PERF_SAMPLE_BRANCH_KERNEL|\
	 PERF_SAMPLE_BRANCH_HV)
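
/*
 * For example, a tool that wants user-level branch records attached to its
 * samples might request (sketch only, field values illustrative):
 *
 *   attr.sample_type        |= PERF_SAMPLE_BRANCH_STACK;
 *   attr.branch_sample_type  = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER;
 *
 * BRANCH_ANY already covers the CALL/RETURN/IND_CALL subtypes, so there is
 * no need to OR those in as well.
 */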

/*
 * Values to determine ABI of the registers dump.
 */
enum perf_sample_regs_abi {
	PERF_SAMPLE_REGS_ABI_NONE	= 0,
	PERF_SAMPLE_REGS_ABI_32		= 1,
	PERF_SAMPLE_REGS_ABI_64		= 2,
};

/*
 * The format of the data returned by read() on a perf event fd,
 * as specified by attr.read_format:
 *
 * struct read_format {
 *	{ u64		value;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		id;           } && PERF_FORMAT_ID
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64		nr;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		value;
 *	    { u64	id;           } && PERF_FORMAT_ID
 *	  }		cntr[nr];
 *	} && PERF_FORMAT_GROUP
 * };
 */
enum perf_event_read_format {
	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
	PERF_FORMAT_ID				= 1U << 2,
	PERF_FORMAT_GROUP			= 1U << 3,

	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
};
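
/*
 * As an illustration, with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING and no PERF_FORMAT_GROUP, user-space might
 * read and scale a (possibly multiplexed) counter like this (sketch only,
 * error handling omitted):
 *
 *   struct { __u64 value, time_enabled, time_running; } rf;
 *   __u64 scaled;
 *
 *   read(fd, &rf, sizeof(rf));
 *   if (rf.time_running)
 *           scaled = rf.value * rf.time_enabled / rf.time_running;
 *
 * time_enabled == time_running means the event was scheduled the whole
 * time and no scaling is needed.
 */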

#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
#define PERF_ATTR_SIZE_VER1	72	/* add: config2 */
#define PERF_ATTR_SIZE_VER2	80	/* add: branch_sample_type */
#define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
					/* add: sample_stack_user */

/*
 * Hardware event_id to monitor via a performance monitoring event:
 */
struct perf_event_attr {

	/*
	 * Major type: hardware/software/tracepoint/etc.
	 */
	__u32			type;

	/*
	 * Size of the attr structure, for fwd/bwd compat.
	 */
	__u32			size;

	/*
	 * Type specific configuration information.
	 */
	__u64			config;

	union {
		__u64		sample_period;
		__u64		sample_freq;
	};

	__u64			sample_type;
	__u64			read_format;

	__u64			disabled       :  1, /* off by default        */
				inherit	       :  1, /* children inherit it   */
				pinned	       :  1, /* must always be on PMU */
				exclusive      :  1, /* only group on PMU     */
				exclude_user   :  1, /* don't count user      */
				exclude_kernel :  1, /* ditto kernel          */
				exclude_hv     :  1, /* ditto hypervisor      */
				exclude_idle   :  1, /* don't count when idle */
				mmap           :  1, /* include mmap data     */
				comm	       :  1, /* include comm data     */
				freq           :  1, /* use freq, not period  */
				inherit_stat   :  1, /* per task counts       */
				enable_on_exec :  1, /* next exec enables     */
				task           :  1, /* trace fork/exit       */
				watermark      :  1, /* wakeup_watermark      */
				/*
				 * precise_ip:
				 *
				 *  0 - SAMPLE_IP can have arbitrary skid
				 *  1 - SAMPLE_IP must have constant skid
				 *  2 - SAMPLE_IP requested to have 0 skid
				 *  3 - SAMPLE_IP must have 0 skid
				 *
				 *  See also PERF_RECORD_MISC_EXACT_IP
				 */
				precise_ip     :  2, /* skid constraint       */
				mmap_data      :  1, /* non-exec mmap data    */
				sample_id_all  :  1, /* sample_type all events */

				exclude_host   :  1, /* don't count in host   */
				exclude_guest  :  1, /* don't count in guest  */

				exclude_callchain_kernel : 1, /* exclude kernel callchains */
				exclude_callchain_user   : 1, /* exclude user callchains */

				__reserved_1   : 41;

	union {
		__u32		wakeup_events;	  /* wakeup every n events */
		__u32		wakeup_watermark; /* bytes before wakeup   */
	};

	__u32			bp_type;
	union {
		__u64		bp_addr;
		__u64		config1; /* extension of config */
	};
	union {
		__u64		bp_len;
		__u64		config2; /* extension of config1 */
	};
	__u64	branch_sample_type; /* enum perf_branch_sample_type */

	/*
	 * Defines set of user regs to dump on samples.
	 * See asm/perf_regs.h for details.
	 */
	__u64	sample_regs_user;

	/*
	 * Defines size of the user stack to dump on samples.
	 */
	__u32	sample_stack_user;

	/* Align to u64. */
	__u32	__reserved_2;
};
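
/*
 * A minimal user-space sketch of filling this structure and opening a
 * counter; glibc provides no wrapper, so the syscall is invoked directly
 * (illustrative only, error handling omitted):
 *
 *   #include <string.h>
 *   #include <unistd.h>
 *   #include <sys/syscall.h>
 *   #include <linux/perf_event.h>
 *
 *   struct perf_event_attr attr;
 *   int fd;
 *
 *   memset(&attr, 0, sizeof(attr));
 *   attr.type           = PERF_TYPE_HARDWARE;
 *   attr.size           = sizeof(attr);
 *   attr.config         = PERF_COUNT_HW_INSTRUCTIONS;
 *   attr.disabled       = 1;
 *   attr.exclude_kernel = 1;
 *
 *   fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 * where the trailing arguments are pid (0 = current task), cpu (-1 = any),
 * group_fd (-1 = no group) and flags.
 */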

#define perf_flags(attr)	(*(&(attr)->read_format + 1))

/*
 * Ioctls that can be done on a perf event fd:
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)

enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP		= 1U << 0,
};
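
/*
 * Typical user-space usage of these ioctls around a measured region, for a
 * counter opened with attr.disabled = 1 (sketch only):
 *
 *   __u64 count;
 *
 *   ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *   ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *   ... code being measured ...
 *   ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *   read(fd, &count, sizeof(count));
 *
 * Passing PERF_IOC_FLAG_GROUP as the ioctl argument applies the operation
 * to the whole event group instead of just this event.
 */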

/*
 * Structure of the page that can be mapped via mmap
 */
struct perf_event_mmap_page {
	__u32	version;		/* version number of this structure */
	__u32	compat_version;		/* lowest version this is compat with */

	/*
	 * Bits needed to read the hw events in user-space.
	 *
	 *   u32 seq, time_mult, time_shift, idx, width;
	 *   u64 count, enabled, running;
	 *   u64 cyc, time_offset;
	 *   s64 pmc = 0;
	 *
	 *   do {
	 *     seq = pc->lock;
	 *     barrier()
	 *
	 *     enabled = pc->time_enabled;
	 *     running = pc->time_running;
	 *
	 *     if (pc->cap_usr_time && enabled != running) {
	 *       cyc = rdtsc();
	 *       time_offset = pc->time_offset;
	 *       time_mult   = pc->time_mult;
	 *       time_shift  = pc->time_shift;
	 *     }
	 *
	 *     idx = pc->index;
	 *     count = pc->offset;
	 *     if (pc->cap_usr_rdpmc && idx) {
	 *       width = pc->pmc_width;
	 *       pmc = rdpmc(idx - 1);
	 *     }
	 *
	 *     barrier();
	 *   } while (pc->lock != seq);
	 *
	 * NOTE: for obvious reasons this only works on self-monitoring
	 *       processes.
	 */
	__u32	lock;			/* seqlock for synchronization */
	__u32	index;			/* hardware event identifier */
	__s64	offset;			/* add to hardware event value */
	__u64	time_enabled;		/* time event active */
	__u64	time_running;		/* time event on cpu */
	union {
		__u64	capabilities;
		__u64	cap_usr_time  : 1,
			cap_usr_rdpmc : 1,
			cap_____res   : 62;
	};

	/*
	 * If cap_usr_rdpmc this field provides the bit-width of the value
	 * read using the rdpmc() or equivalent instruction. This can be used
	 * to sign extend the result like:
	 *
	 *   pmc <<= 64 - width;
	 *   pmc >>= 64 - width; // signed shift right
	 *   count += pmc;
	 */
	__u16	pmc_width;

	/*
	 * If cap_usr_time the below fields can be used to compute the time
	 * delta since time_enabled (in ns) using rdtsc or similar.
	 *
	 *   u64 quot, rem;
	 *   u64 delta;
	 *
	 *   quot = (cyc >> time_shift);
	 *   rem = cyc & ((1 << time_shift) - 1);
	 *   delta = time_offset + quot * time_mult +
	 *              ((rem * time_mult) >> time_shift);
	 *
	 * Where time_offset,time_mult,time_shift and cyc are read in the
	 * seqcount loop described above. This delta can then be added to
	 * enabled and possibly running (if idx), improving the scaling:
	 *
	 *   enabled += delta;
	 *   if (idx)
	 *     running += delta;
	 *
	 *   quot = count / running;
	 *   rem  = count % running;
	 *   count = quot * enabled + (rem * enabled) / running;
	 */
	__u16	time_shift;
	__u32	time_mult;
	__u64	time_offset;

		/*
		 * Hole for extension of the self monitor capabilities
		 */

	__u64	__reserved[120];	/* align to 1k */

	/*
	 * Control data for the mmap() data buffer.
	 *
	 * User-space reading the @data_head value should issue an rmb(), on
	 * SMP capable platforms, after reading this value -- see
	 * perf_event_wakeup().
	 *
	 * When the mapping is PROT_WRITE the @data_tail value should be
	 * written by userspace to reflect the last read data. In this case
	 * the kernel will not over-write unread data.
	 */
	__u64   data_head;		/* head in the data section */
	__u64	data_tail;		/* user-space written tail */
};
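
/*
 * A self-monitoring reader of the mmap()ed data area might drain it roughly
 * as follows (sketch only: "base" is the start of the mapping, "size" the
 * power-of-two size of the data area, and pc points at this page):
 *
 *   __u64 head, tail = pc->data_tail;
 *
 *   head = pc->data_head;
 *   rmb();
 *   while (tail != head) {
 *           struct perf_event_header *hdr;
 *
 *           hdr = base + PAGE_SIZE + (tail & (size - 1));
 *           ... consume the record, minding wrap-around at the end ...
 *           tail += hdr->size;
 *   }
 *   pc->data_tail = tail;
 *
 * Writing data_tail back is only meaningful for PROT_WRITE mappings, as
 * described above.
 */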

#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
#define PERF_RECORD_MISC_KERNEL			(1 << 0)
#define PERF_RECORD_MISC_USER			(2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)

/*
 * Indicates that the content of PERF_SAMPLE_IP points to
 * the actual instruction that triggered the event. See also
 * perf_event_attr::precise_ip.
 */
#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
/*
 * Reserve the last bit to indicate some extended misc field
 */
#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)

struct perf_event_header {
	__u32	type;
	__u16	misc;
	__u16	size;
};

enum perf_event_type {

	/*
	 * If perf_event_attr.sample_id_all is set then all event types will
	 * have the sample_type selected fields related to where/when
	 * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID),
	 * described in PERF_RECORD_SAMPLE below; they will be stashed just
	 * after the perf_event_header and the fields already present for
	 * the existing record types, i.e. at the end of the payload. That
	 * way a newer perf.data file will be supported by older perf tools,
	 * with these new optional fields being ignored.
	 *
	 * The MMAP events record the PROT_EXEC mappings so that we can
	 * correlate userspace IPs to code. They have the following structure:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	u64				addr;
	 *	u64				len;
	 *	u64				pgoff;
	 *	char				filename[];
	 * };
	 */
	PERF_RECORD_MMAP			= 1,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				id;
	 *	u64				lost;
	 * };
	 */
	PERF_RECORD_LOST			= 2,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	char				comm[];
	 * };
	 */
	PERF_RECORD_COMM			= 3,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_EXIT			= 4,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 *	u64				id;
	 *	u64				stream_id;
	 * };
	 */
	PERF_RECORD_THROTTLE			= 5,
	PERF_RECORD_UNTHROTTLE			= 6,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_FORK			= 7,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, tid;
	 *
	 *	struct read_format		values;
	 * };
	 */
	PERF_RECORD_READ			= 8,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
	 *	{ u64			time;     } && PERF_SAMPLE_TIME
	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
	 *	{ u64			id;	  } && PERF_SAMPLE_ID
	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
	 *
	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
	 *
	 *	{ u64			nr,
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *
	 *	#
	 *	# The RAW record below is opaque data wrt the ABI
	 *	#
	 *	# That is, the ABI doesn't make any promises wrt to
	 *	# the stability of its content, it may vary depending
	 *	# on event, hardware, kernel version and phase of
	 *	# the moon.
	 *	#
	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
	 *	#
	 *
	 *	{ u32			size;
	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
	 *
	 *	{ u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
	 *
	 * 	{ u64			abi; # enum perf_sample_regs_abi
	 * 	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
	 *
	 * 	{ u64			size;
	 * 	  char			data[size];
	 * 	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
	 * };
	 */
	PERF_RECORD_SAMPLE			= 9,

	PERF_RECORD_MAX,			/* non-ABI */
};
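
/*
 * A consumer of the stream might dispatch on the header like this (sketch
 * only; parse_sample()/update_task_state()/note_lost_events() are
 * placeholders, and hdr->size is what advances the cursor):
 *
 *   struct perf_event_header *hdr = ...;
 *
 *   switch (hdr->type) {
 *   case PERF_RECORD_SAMPLE:
 *           parse_sample(hdr);      layout is governed by attr.sample_type
 *           break;
 *   case PERF_RECORD_MMAP:
 *   case PERF_RECORD_COMM:
 *   case PERF_RECORD_FORK:
 *   case PERF_RECORD_EXIT:
 *           update_task_state(hdr);
 *           break;
 *   case PERF_RECORD_LOST:
 *           note_lost_events(hdr);
 *           break;
 *   }
 */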

#define PERF_MAX_STACK_DEPTH		127

enum perf_callchain_context {
	PERF_CONTEXT_HV			= (__u64)-32,
	PERF_CONTEXT_KERNEL		= (__u64)-128,
	PERF_CONTEXT_USER		= (__u64)-512,

	PERF_CONTEXT_GUEST		= (__u64)-2048,
	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,

	PERF_CONTEXT_MAX		= (__u64)-4095,
};
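
/*
 * These values appear inline in the PERF_SAMPLE_CALLCHAIN ips[] array: an
 * entry at or above PERF_CONTEXT_MAX is a context marker, everything else
 * is a real instruction pointer, so a consumer might split the chain like
 * this (sketch only):
 *
 *   for (i = 0; i < nr; i++) {
 *           if (ips[i] >= (__u64)PERF_CONTEXT_MAX)
 *                   context = ips[i];
 *           else
 *                   record_frame(context, ips[i]);
 *   }
 */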

#define PERF_FLAG_FD_NO_GROUP		(1U << 0)
#define PERF_FLAG_FD_OUTPUT		(1U << 1)
#define PERF_FLAG_PID_CGROUP		(1U << 2) /* pid=cgroup id, per-cpu mode only */

#ifdef __KERNEL__
/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <linux/cgroup.h>
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

struct perf_guest_info_callbacks {
	int				(*is_in_guest)(void);
	int				(*is_user_mode)(void);
	unsigned long			(*get_guest_ip)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <asm/local.h>

struct perf_callchain_entry {
	__u64				nr;
	__u64				ip[PERF_MAX_STACK_DEPTH];
};

struct perf_raw_record {
	u32				size;
	void				*data;
};

/*
 * single taken branch record layout:
 *
 *      from: source instruction (may not always be a branch insn)
 *        to: branch target
 *   mispred: branch target was mispredicted
 * predicted: branch target was predicted
 *
 * support for mispred, predicted is optional. In case it
 * is not supported mispred = predicted = 0.
 */
struct perf_branch_entry {
	__u64	from;
	__u64	to;
	__u64	mispred:1,  /* target mispredicted */
		predicted:1,/* target predicted */
		reserved:62;
};

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *
 * Note that nr can vary from sample to sample.
 * Branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 */
struct perf_branch_stack {
	__u64				nr;
	struct perf_branch_entry	entries[0];
};

struct perf_regs_user {
	__u64		abi;
	struct pt_regs	*regs;
};

struct task_struct;

/*
 * extra PMU register associated with an event
 */
struct hw_perf_event_extra {
	u64		config;	/* register value */
	unsigned int	reg;	/* register address or index */
	int		alloc;	/* extra register already allocated */
	int		idx;	/* index in shared_regs->regs[] */
};

/**
 * struct hw_perf_event - performance event hardware details:
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
	union {
		struct { /* hardware */
			u64		config;
			u64		last_tag;
			unsigned long	config_base;
			unsigned long	event_base;
			int		event_base_rdpmc;
			int		idx;
			int		last_cpu;

			struct hw_perf_event_extra extra_reg;
			struct hw_perf_event_extra branch_reg;
		};
		struct { /* software */
			struct hrtimer	hrtimer;
		};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
		struct { /* breakpoint */
			struct arch_hw_breakpoint	info;
			struct list_head		bp_list;
			/*
			 * Crufty hack to avoid the chicken and egg
			 * problem hw_breakpoint has with context
			 * creation and event initialization.
			 */
			struct task_struct		*bp_target;
		};
#endif
	};
	int				state;
	local64_t			prev_count;
	u64				sample_period;
	u64				last_period;
	local64_t			period_left;
	u64                             interrupts_seq;
	u64				interrupts;

	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif
};

/*
 * hw_perf_event::state flags
 */
#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
#define PERF_HES_ARCH		0x04

struct perf_event;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 */
#define PERF_EVENT_TXN 0x1

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
	struct list_head		entry;

	struct device			*dev;
	const struct attribute_group	**attr_groups;
	char				*name;
	int				type;

	int * __percpu			pmu_disable_count;
	struct perf_cpu_context * __percpu pmu_cpu_context;
	int				task_ctx_nr;

	/*
	 * Fully disable/enable this PMU, can be used to protect from the PMI
	 * as well as for lazy/batch writing of the MSRs.
	 */
	void (*pmu_enable)		(struct pmu *pmu); /* optional */
	void (*pmu_disable)		(struct pmu *pmu); /* optional */

	/*
	 * Try and initialize the event for this PMU.
	 * Should return -ENOENT when the @event doesn't match this PMU.
	 */
	int (*event_init)		(struct perf_event *event);

#define PERF_EF_START	0x01		/* start the counter when adding    */
#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */

	/*
	 * Adds/Removes a counter to/from the PMU, can be done inside
	 * a transaction, see the ->*_txn() methods.
	 */
	int  (*add)			(struct perf_event *event, int flags);
	void (*del)			(struct perf_event *event, int flags);

	/*
	 * Starts/Stops a counter present on the PMU. The PMI handler
	 * should stop the counter when perf_event_overflow() returns
	 * !0. ->start() will be used to continue.
	 */
	void (*start)			(struct perf_event *event, int flags);
	void (*stop)			(struct perf_event *event, int flags);

	/*
	 * Updates the counter value of the event.
	 */
	void (*read)			(struct perf_event *event);

	/*
	 * Group events scheduling is treated as a transaction, add
	 * group events as a whole and perform one schedulability test.
	 * If the test fails, roll back the whole group
	 *
	 * Start the transaction, after this ->add() doesn't need to
	 * do schedulability tests.
	 */
	void (*start_txn)		(struct pmu *pmu); /* optional */
	/*
	 * If ->start_txn() disabled the ->add() schedulability test
	 * then ->commit_txn() is required to perform one. On success
	 * the transaction is closed. On error the transaction is kept
	 * open until ->cancel_txn() is called.
	 */
	int  (*commit_txn)		(struct pmu *pmu); /* optional */
	/*
	 * Will cancel the transaction, assumes ->del() is called
	 * for each successful ->add() during the transaction.
	 */
	void (*cancel_txn)		(struct pmu *pmu); /* optional */

	/*
	 * Will return the value for perf_event_mmap_page::index for this event,
	 * if no implementation is provided it will default to: event->hw.idx + 1.
	 */
	int (*event_idx)		(struct perf_event *event); /*optional */

	/*
	 * flush branch stack on context-switches (needed in cpu-wide mode)
	 */
	void (*flush_branch_stack)	(void);
};
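
/*
 * For PMUs that implement the optional transaction methods, group
 * scheduling in the core roughly follows this pattern (sketch only):
 *
 *   pmu->start_txn(pmu);
 *   for each event in the group:
 *           if (pmu->add(event, PERF_EF_START))
 *                   goto fail;
 *   if (!pmu->commit_txn(pmu))
 *           return 0;
 * fail:
 *   ->del() the events already added, then pmu->cancel_txn(pmu);
 */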

/**
 * enum perf_event_active_state - the states of an event
 */
enum perf_event_active_state {
	PERF_EVENT_STATE_ERROR		= -2,
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	=  0,
	PERF_EVENT_STATE_ACTIVE		=  1,
};

struct file;
struct perf_sample_data;

typedef void (*perf_overflow_handler_t)(struct perf_event *,
					struct perf_sample_data *,
					struct pt_regs *regs);

enum perf_group_flag {
	PERF_GROUP_SOFTWARE		= 0x1,
};

#define SWEVENT_HLIST_BITS		8
#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
	struct rcu_head			rcu_head;
};

#define PERF_ATTACH_CONTEXT	0x01
#define PERF_ATTACH_GROUP	0x02
#define PERF_ATTACH_TASK	0x04

#ifdef CONFIG_CGROUP_PERF
/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure.
 */
struct perf_cgroup_info {
	u64				time;
	u64				timestamp;
};

struct perf_cgroup {
	struct				cgroup_subsys_state css;
	struct				perf_cgroup_info *info;	/* timing info, one per cpu */
};
#endif

struct ring_buffer;

/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
	struct list_head		group_entry;
	struct list_head		event_entry;
	struct list_head		sibling_list;
	struct hlist_node		hlist_entry;
	int				nr_siblings;
	int				group_flags;
	struct perf_event		*group_leader;
	struct pmu			*pmu;

	enum perf_event_active_state	state;
	unsigned int			attach_state;
	local64_t			count;
	atomic64_t			child_count;

	/*
	 * These are the total time in nanoseconds that the event
	 * has been enabled (i.e. eligible to run, and the task has
	 * been scheduled in, if this is a per-task event)
	 * and running (scheduled onto the CPU), respectively.
	 *
	 * They are computed from tstamp_enabled, tstamp_running and
	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
	 */
	u64				total_time_enabled;
	u64				total_time_running;

	/*
	 * These are timestamps used for computing total_time_enabled
	 * and total_time_running when the event is in INACTIVE or
	 * ACTIVE state, measured in nanoseconds from an arbitrary point
	 * in time.
	 * tstamp_enabled: the notional time when the event was enabled
	 * tstamp_running: the notional time when the event was scheduled on
	 * tstamp_stopped: in INACTIVE state, the notional time when the
	 *	event was scheduled off.
	 */
	u64				tstamp_enabled;
	u64				tstamp_running;
	u64				tstamp_stopped;

	/*
	 * timestamp shadows the actual context timing but it can
	 * be safely used in NMI interrupt context. It reflects the
	 * context time as it was when the event was last scheduled in.
	 *
	 * ctx_time already accounts for ctx->timestamp. Therefore to
	 * compute ctx_time for a sample, simply add perf_clock().
	 */
	u64				shadow_ctx_time;

	struct perf_event_attr		attr;
	u16				header_size;
	u16				id_header_size;
	u16				read_size;
	struct hw_perf_event		hw;

	struct perf_event_context	*ctx;
	atomic_long_t			refcount;

	/*
	 * These accumulate total time (in nanoseconds) that children
	 * events have been enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * Protect attach/detach and child_list:
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_event		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap bits */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;
	int				mmap_locked;
	struct user_struct		*mmap_user;
	struct ring_buffer		*rb;
	struct list_head		rb_entry;

	/* poll related */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* delayed work for NMIs and such */
	int				pending_wakeup;
	int				pending_kill;
	int				pending_disable;
	struct irq_work			pending;

	atomic_t			event_limit;

	void (*destroy)(struct perf_event *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;

	perf_overflow_handler_t		overflow_handler;
	void				*overflow_handler_context;

#ifdef CONFIG_EVENT_TRACING
	struct ftrace_event_call	*tp_event;
	struct event_filter		*filter;
#ifdef CONFIG_FUNCTION_TRACER
	struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
	struct perf_cgroup		*cgrp; /* cgroup event is attached to */
	int				cgrp_defer_enabled;
#endif

#endif /* CONFIG_PERF_EVENTS */
};

enum perf_event_context_type {
	task_context,
	cpu_context,
};

/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
	struct pmu			*pmu;
	enum perf_event_context_type	type;
	/*
	 * Protect the states of the events in the list,
	 * nr_active, and the list:
	 */
	raw_spinlock_t			lock;
	/*
	 * Protect the list of events.  Locking either mutex or lock
	 * is sufficient to ensure the list doesn't change; to change
	 * the list you need to lock both the mutex and the spinlock.
	 */
	struct mutex			mutex;

	struct list_head		pinned_groups;
	struct list_head		flexible_groups;
	struct list_head		event_list;
	int				nr_events;
	int				nr_active;
	int				is_active;
	int				nr_stat;
	int				nr_freq;
	int				rotate_disable;
	atomic_t			refcount;
	struct task_struct		*task;

	/*
	 * Context clock, runs when context enabled.
	 */
	u64				time;
	u64				timestamp;

	/*
	 * These fields let us detect when two contexts have both
	 * been cloned (inherited) from a common ancestor.
	 */
	struct perf_event_context	*parent_ctx;
	u64				parent_gen;
	u64				generation;
	int				pin_count;
	int				nr_cgroups;	 /* cgroup evts */
	int				nr_branch_stack; /* branch_stack evt */
	struct rcu_head			rcu_head;
};

/*
 * Number of contexts where an event can trigger:
 *	task, softirq, hardirq, nmi.
 */
#define PERF_NR_CONTEXTS	4

/**
 * struct perf_event_cpu_context - per cpu event context structure
 */
struct perf_cpu_context {
	struct perf_event_context	ctx;
	struct perf_event_context	*task_ctx;
	int				active_oncpu;
	int				exclusive;
	struct list_head		rotation_list;
	int				jiffies_interval;
	struct pmu			*active_pmu;
	struct perf_cgroup		*cgrp;
};

struct perf_output_handle {
	struct perf_event		*event;
	struct ring_buffer		*rb;
	unsigned long			wakeup;
	unsigned long			size;
	void				*addr;
	int				page;
};

#ifdef CONFIG_PERF_EVENTS

extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern int perf_num_counters(void);
extern const char *perf_pmu_name(void);
extern void __perf_event_task_sched_in(struct task_struct *prev,
				       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
					struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
				int cpu,
				struct task_struct *task,
				perf_overflow_handler_t callback,
				void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
				int src_cpu, int dst_cpu);
extern u64 perf_event_read_value(struct perf_event *event,
				 u64 *enabled, u64 *running);


struct perf_sample_data {
	u64				type;

	u64				ip;
	struct {
		u32	pid;
		u32	tid;
	}				tid_entry;
	u64				time;
	u64				addr;
	u64				id;
	u64				stream_id;
	struct {
		u32	cpu;
		u32	reserved;
	}				cpu_entry;
	u64				period;
	struct perf_callchain_entry	*callchain;
	struct perf_raw_record		*raw;
	struct perf_branch_stack	*br_stack;
	struct perf_regs_user		regs_user;
	u64				stack_user_size;
};

static inline void perf_sample_data_init(struct perf_sample_data *data,
					 u64 addr, u64 period)
{
	/* remaining struct members initialized in perf_prepare_sample() */
	data->addr = addr;
	data->raw  = NULL;
	data->br_stack = NULL;
	data->period = period;
	data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE;
	data->regs_user.regs = NULL;
	data->stack_user_size = 0;
}

extern void perf_output_sample(struct perf_output_handle *handle,
			       struct perf_event_header *header,
			       struct perf_sample_data *data,
			       struct perf_event *event);
extern void perf_prepare_sample(struct perf_event_header *header,
				struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
				 struct perf_sample_data *data,
				 struct pt_regs *regs);

static inline bool is_sampling_event(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}

/*
 * Return 1 for a software event, 0 for a hardware event
 */
static inline int is_software_event(struct perf_event *event)
{
	return event->pmu->task_ctx_nr == perf_sw_context;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * Take a snapshot of the regs. Skip ip and frame pointer to
 * the nth caller. We only need a few of the regs:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - bp for callchains
 * - eflags, for future purposes, just in case
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
	memset(regs, 0, sizeof(*regs));

	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
	struct pt_regs hot_regs;

	if (static_key_false(&perf_swevent_enabled[event_id])) {
		if (!regs) {
			perf_fetch_caller_regs(&hot_regs);
			regs = &hot_regs;
		}
		__perf_sw_event(event_id, nr, regs, addr);
	}
}

extern struct static_key_deferred perf_sched_events;

static inline void perf_event_task_sched_in(struct task_struct *prev,
					    struct task_struct *task)
{
	if (static_key_false(&perf_sched_events.key))
		__perf_event_task_sched_in(prev, task);
}

static inline void perf_event_task_sched_out(struct task_struct *prev,
					     struct task_struct *next)
{
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);

	if (static_key_false(&perf_sched_events.key))
		__perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);
extern struct perf_guest_info_callbacks *perf_guest_cbs;
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);

extern void perf_event_comm(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);

static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
	if (entry->nr < PERF_MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = ip;
}

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;

extern int perf_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos);

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

extern void perf_event_init(void);
extern void perf_tp_event(u64 addr, u64 count, void *record,
			  int entry_size, struct pt_regs *regs,
			  struct hlist_head *head, int rctx,
			  struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)	instruction_pointer(regs)
#endif

static inline bool has_branch_stack(struct perf_event *event)
{
	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}

extern int perf_output_begin(struct perf_output_handle *handle,
			     struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
			     const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
				     unsigned int len);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern int __perf_event_disable(void *info);
extern void perf_event_task_tick(void);
#else
static inline void
perf_event_task_sched_in(struct task_struct *prev,
			 struct task_struct *task)			{ }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
			  struct task_struct *next)			{ }
static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
static inline void perf_event_exit_task(struct task_struct *child)	{ }
static inline void perf_event_free_task(struct task_struct *task)	{ }
static inline void perf_event_delayed_put(struct task_struct *task)	{ }
static inline void perf_event_print_debug(void)				{ }
static inline int perf_event_task_disable(void)				{ return -EINVAL; }
static inline int perf_event_task_enable(void)				{ return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
	return -EINVAL;
}

static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
static inline void
perf_bp_event(struct perf_event *event, void *data)			{ }

static inline int perf_register_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)				{ return 0; }
static inline int perf_unregister_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)				{ return 0; }

static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
static inline void perf_event_comm(struct task_struct *tsk)		{ }
static inline void perf_event_fork(struct task_struct *tsk)		{ }
static inline void perf_event_init(void)				{ }
static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)		{ }
static inline void perf_event_enable(struct perf_event *event)		{ }
static inline void perf_event_disable(struct perf_event *event)		{ }
static inline int __perf_event_disable(void *info)			{ return -1; }
static inline void perf_event_task_tick(void)				{ }
#endif

#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * This has to have a higher priority than migration_notifier in sched.c.
 */
#define perf_cpu_notifier(fn)						\
do {									\
	static struct notifier_block fn##_nb __cpuinitdata =		\
		{ .notifier_call = fn, .priority = CPU_PRI_PERF };	\
	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,			\
		(void *)(unsigned long)smp_processor_id());		\
	fn(&fn##_nb, (unsigned long)CPU_STARTING,			\
		(void *)(unsigned long)smp_processor_id());		\
	fn(&fn##_nb, (unsigned long)CPU_ONLINE,				\
		(void *)(unsigned long)smp_processor_id());		\
	register_cpu_notifier(&fn##_nb);				\
} while (0)


#define PMU_FORMAT_ATTR(_name, _format)					\
static ssize_t								\
_name##_show(struct device *dev,					\
			       struct device_attribute *attr,		\
			       char *page)				\
{									\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
	return sprintf(page, _format "\n");				\
}									\
									\
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)
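
/*
 * A PMU driver typically uses this to describe its config bit-fields in
 * sysfs; a hypothetical driver exposing an 8-bit event selector might do:
 *
 *   PMU_FORMAT_ATTR(event, "config:0-7");
 *
 *   static struct attribute *my_pmu_format_attrs[] = {
 *           &format_attr_event.attr,
 *           NULL,
 *   };
 *
 * and publish my_pmu_format_attrs through pmu::attr_groups.
 */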

#endif /* __KERNEL__ */
#endif /* _LINUX_PERF_EVENT_H */