/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>

/*
 * User-space ABI bits:
 */

/*
 * attr.type
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE			= 0,
	PERF_TYPE_SOFTWARE			= 1,
	PERF_TYPE_TRACEPOINT			= 2,
	PERF_TYPE_HW_CACHE			= 3,
	PERF_TYPE_RAW				= 4,
	PERF_TYPE_BREAKPOINT			= 5,

	PERF_TYPE_MAX,				/* non-ABI */
};

/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open()
 * syscall:
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,

	PERF_COUNT_HW_MAX,			/* non-ABI */
};

/*
 * Generalized hardware cache events:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D			= 0,
	PERF_COUNT_HW_CACHE_L1I			= 1,
	PERF_COUNT_HW_CACHE_LL			= 2,
	PERF_COUNT_HW_CACHE_DTLB		= 3,
	PERF_COUNT_HW_CACHE_ITLB		= 4,
	PERF_COUNT_HW_CACHE_BPU			= 5,

	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ		= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,

	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};

/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and sw events of the kernel (and allow the profiling of them as
 * well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK			= 0,
	PERF_COUNT_SW_TASK_CLOCK		= 1,
	PERF_COUNT_SW_PAGE_FAULTS		= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
	PERF_COUNT_SW_EMULATION_FAULTS		= 8,

	PERF_COUNT_SW_MAX,			/* non-ABI */
};

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 */
enum perf_event_sample_format {
	PERF_SAMPLE_IP				= 1U << 0,
	PERF_SAMPLE_TID				= 1U << 1,
	PERF_SAMPLE_TIME			= 1U << 2,
	PERF_SAMPLE_ADDR			= 1U << 3,
	PERF_SAMPLE_READ			= 1U << 4,
	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
	PERF_SAMPLE_ID				= 1U << 6,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_RAW				= 1U << 10,

	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
};

/*
 * The format of the data returned by read() on a perf event fd,
 * as specified by attr.read_format:
 *
 * struct read_format {
 *	{ u64		value;
 *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_RUNNING
 *	  { u64		id;           } && PERF_FORMAT_ID
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64		nr;
 *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_RUNNING
 *	  { u64		value;
 *	    { u64	id;           } && PERF_FORMAT_ID
 *	  }		cntr[nr];
 *	} && PERF_FORMAT_GROUP
 * };
 */
enum perf_event_read_format {
	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
	PERF_FORMAT_ID				= 1U << 2,
	PERF_FORMAT_GROUP			= 1U << 3,

	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
};
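/*
 * Illustrative sketch, not part of the original header: with a read_format
 * of PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING, a
 * single (non-group) counter can be scaled in user-space to compensate for
 * multiplexing. "fd" is assumed to come from sys_perf_event_open():
 *
 *	struct { __u64 value, time_enabled, time_running; } rf;
 *
 *	read(fd, &rf, sizeof(rf));
 *	if (rf.time_running && rf.time_running < rf.time_enabled)
 *		rf.value = rf.value * rf.time_enabled / rf.time_running;
 */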

#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */

/*
 * Hardware event_id to monitor via a performance monitoring event:
 */
struct perf_event_attr {

	/*
	 * Major type: hardware/software/tracepoint/etc.
	 */
	__u32			type;

	/*
	 * Size of the attr structure, for fwd/bwd compat.
	 */
	__u32			size;

	/*
	 * Type specific configuration information.
	 */
	__u64			config;

	union {
		__u64		sample_period;
		__u64		sample_freq;
	};

	__u64			sample_type;
	__u64			read_format;

	__u64			disabled       :  1, /* off by default        */
				inherit	       :  1, /* children inherit it   */
				pinned	       :  1, /* must always be on PMU */
				exclusive      :  1, /* only group on PMU     */
				exclude_user   :  1, /* don't count user      */
				exclude_kernel :  1, /* ditto kernel          */
				exclude_hv     :  1, /* ditto hypervisor      */
				exclude_idle   :  1, /* don't count when idle */
				mmap           :  1, /* include mmap data     */
				comm	       :  1, /* include comm data     */
				freq           :  1, /* use freq, not period  */
				inherit_stat   :  1, /* per task counts       */
				enable_on_exec :  1, /* next exec enables     */
				task           :  1, /* trace fork/exit       */
				watermark      :  1, /* wakeup_watermark      */
				/*
				 * precise_ip:
				 *
				 *  0 - SAMPLE_IP can have arbitrary skid
				 *  1 - SAMPLE_IP must have constant skid
				 *  2 - SAMPLE_IP requested to have 0 skid
				 *  3 - SAMPLE_IP must have 0 skid
				 *
				 *  See also PERF_RECORD_MISC_EXACT_IP
				 */
				precise_ip     :  2, /* skid constraint       */
				mmap_data      :  1, /* non-exec mmap data    */

				__reserved_1   : 46;

	union {
		__u32		wakeup_events;	  /* wakeup every n events */
		__u32		wakeup_watermark; /* bytes before wakeup   */
	};

	__u32			bp_type;
	__u64			bp_addr;
	__u64			bp_len;
};
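/*
 * Illustrative sketch, not part of the original header: a minimal user-space
 * setup of this structure for a task-local CPU-cycle counter. There is no
 * glibc wrapper for the syscall, so it goes through syscall(2); the variable
 * names and the omitted error handling are assumptions of the example.
 *
 *	struct perf_event_attr attr;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.type		= PERF_TYPE_HARDWARE;
 *	attr.size		= sizeof(attr);
 *	attr.config		= PERF_COUNT_HW_CPU_CYCLES;
 *	attr.disabled		= 1;
 *	attr.exclude_kernel	= 1;
 *
 *	pid == 0, cpu == -1: measure the calling task on any CPU
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */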

/*
 * Ioctls that can be done on a perf event fd:
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)

enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP		= 1U << 0,
};
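/*
 * Illustrative sketch, not part of the original header: with a counter fd as
 * above, the enable/disable/reset ioctls plus an ordinary read() are enough
 * for simple self-monitoring ("workload()" is a stand-in for the code being
 * measured; the layout returned by read() follows attr.read_format):
 *
 *	__u64 count;
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	workload();
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 */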

/*
 * Structure of the page that can be mapped via mmap
 */
struct perf_event_mmap_page {
	__u32	version;		/* version number of this structure */
	__u32	compat_version;		/* lowest version this is compat with */

	/*
	 * Bits needed to read the hw events in user-space.
	 *
	 *   u32 seq;
	 *   s64 count;
	 *
	 *   do {
	 *     seq = pc->lock;
	 *
	 *     barrier()
	 *     if (pc->index) {
	 *       count = pmc_read(pc->index - 1);
	 *       count += pc->offset;
	 *     } else
	 *       goto regular_read;
	 *
	 *     barrier();
	 *   } while (pc->lock != seq);
	 *
	 * NOTE: for obvious reason this only works on self-monitoring
	 *       processes.
	 */
	__u32	lock;			/* seqlock for synchronization */
	__u32	index;			/* hardware event identifier */
	__s64	offset;			/* add to hardware event value */
	__u64	time_enabled;		/* time event active */
	__u64	time_running;		/* time event on cpu */

		/*
		 * Hole for extension of the self monitor capabilities
		 */

	__u64	__reserved[123];	/* align to 1k */

	/*
	 * Control data for the mmap() data buffer.
	 *
	 * User-space reading the @data_head value should issue an rmb(), on
	 * SMP capable platforms, after reading this value -- see
	 * perf_event_wakeup().
	 *
	 * When the mapping is PROT_WRITE the @data_tail value should be
	 * written by userspace to reflect the last read data. In this case
	 * the kernel will not over-write unread data.
	 */
	__u64   data_head;		/* head in the data section */
	__u64	data_tail;		/* user-space written tail */
};
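/*
 * Illustrative sketch, not part of the original header: one plausible way
 * for a user-space reader to drain the mmap()ed data area while honouring
 * data_head/data_tail. "base" is the mmap() return value, "mask" is the
 * size of the data area minus one, consume_record() is a placeholder, and
 * rmb() stands for whatever read barrier the architecture needs; records
 * that wrap around the end of the buffer are not handled here.
 *
 *	struct perf_event_mmap_page *pc = base;
 *	char *data = (char *)base + page_size;
 *	__u64 head, tail = pc->data_tail;
 *
 *	head = pc->data_head;
 *	rmb();
 *	while (tail < head) {
 *		struct perf_event_header *hdr;
 *
 *		hdr = (void *)(data + (tail & mask));
 *		consume_record(hdr);
 *		tail += hdr->size;
 *	}
 *	pc->data_tail = tail;	(only useful for PROT_WRITE mappings)
 */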

#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
#define PERF_RECORD_MISC_KERNEL			(1 << 0)
#define PERF_RECORD_MISC_USER			(2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)

/*
 * Indicates that the content of PERF_SAMPLE_IP points to
 * the actual instruction that triggered the event. See also
 * perf_event_attr::precise_ip.
 */
#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
/*
 * Reserve the last bit to indicate some extended misc field
 */
#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)

struct perf_event_header {
	__u32	type;
	__u16	misc;
	__u16	size;
};

enum perf_event_type {

	/*
	 * The MMAP events record the PROT_EXEC mappings so that we can
	 * correlate userspace IPs to code. They have the following structure:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	u64				addr;
	 *	u64				len;
	 *	u64				pgoff;
	 *	char				filename[];
	 * };
	 */
	PERF_RECORD_MMAP			= 1,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				id;
	 *	u64				lost;
	 * };
	 */
	PERF_RECORD_LOST			= 2,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	char				comm[];
	 * };
	 */
	PERF_RECORD_COMM			= 3,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_EXIT			= 4,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 *	u64				id;
	 *	u64				stream_id;
	 * };
	 */
	PERF_RECORD_THROTTLE			= 5,
	PERF_RECORD_UNTHROTTLE			= 6,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_FORK			= 7,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, tid;
	 *
	 *	struct read_format		values;
	 * };
	 */
	PERF_RECORD_READ			= 8,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
	 *	{ u64			time;     } && PERF_SAMPLE_TIME
	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
	 *	{ u64			id;	  } && PERF_SAMPLE_ID
	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
	 *
	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
	 *
	 *	{ u64			nr,
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *
	 *	#
	 *	# The RAW record below is opaque data wrt the ABI
	 *	#
	 *	# That is, the ABI doesn't make any promises wrt to
	 *	# the stability of its content, it may vary depending
	 *	# on event, hardware, kernel version and phase of
	 *	# the moon.
	 *	#
	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
	 *	#
	 *
	 *	{ u32			size;
	 *	  char                  data[size];}&& PERF_SAMPLE_RAW
	 * };
	 */
	PERF_RECORD_SAMPLE			= 9,

	PERF_RECORD_MAX,			/* non-ABI */
};

enum perf_callchain_context {
	PERF_CONTEXT_HV			= (__u64)-32,
	PERF_CONTEXT_KERNEL		= (__u64)-128,
	PERF_CONTEXT_USER		= (__u64)-512,

	PERF_CONTEXT_GUEST		= (__u64)-2048,
	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,

	PERF_CONTEXT_MAX		= (__u64)-4095,
};

#define PERF_FLAG_FD_NO_GROUP	(1U << 0)
#define PERF_FLAG_FD_OUTPUT	(1U << 1)

#ifdef __KERNEL__
/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
#endif

struct perf_guest_info_callbacks {
	int (*is_in_guest) (void);
	int (*is_user_mode) (void);
	unsigned long (*get_guest_ip) (void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <asm/atomic.h>
#include <asm/local.h>

#define PERF_MAX_STACK_DEPTH		255

struct perf_callchain_entry {
	__u64				nr;
	__u64				ip[PERF_MAX_STACK_DEPTH];
};

struct perf_raw_record {
	u32				size;
	void				*data;
};

struct perf_branch_entry {
	__u64				from;
	__u64				to;
	__u64				flags;
};

struct perf_branch_stack {
	__u64				nr;
	struct perf_branch_entry	entries[0];
};

struct task_struct;

/**
 * struct hw_perf_event - performance event hardware details:
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
	union {
		struct { /* hardware */
			u64		config;
			u64		last_tag;
			unsigned long	config_base;
			unsigned long	event_base;
			int		idx;
			int		last_cpu;
		};
		struct { /* software */
			s64		remaining;
			struct hrtimer	hrtimer;
		};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
		/* breakpoint */
		struct arch_hw_breakpoint	info;
#endif
	};
	atomic64_t			prev_count;
	u64				sample_period;
	u64				last_period;
	atomic64_t			period_left;
	u64				interrupts;

	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif
};

struct perf_event;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 */
#define PERF_EVENT_TXN 0x1

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
	int (*enable)			(struct perf_event *event);
	void (*disable)			(struct perf_event *event);
	int (*start)			(struct perf_event *event);
	void (*stop)			(struct perf_event *event);
	void (*read)			(struct perf_event *event);
	void (*unthrottle)		(struct perf_event *event);

	/*
	 * Group events scheduling is treated as a transaction, add group
	 * events as a whole and perform one schedulability test. If the test
	 * fails, roll back the whole group
	 */

	/*
	 * Start the transaction, after this ->enable() doesn't need
	 * to do schedulability tests.
	 */
	void (*start_txn)	(const struct pmu *pmu);
	/*
	 * If ->start_txn() disabled the ->enable() schedulability test
	 * then ->commit_txn() is required to perform one. On success
	 * the transaction is closed. On error the transaction is kept
	 * open until ->cancel_txn() is called.
	 */
	int  (*commit_txn)	(const struct pmu *pmu);
	/*
	 * Will cancel the transaction, assumes ->disable() is called for
	 * each successful ->enable() during the transaction.
	 */
	void (*cancel_txn)	(const struct pmu *pmu);
};

/**
 * enum perf_event_active_state - the states of an event
 */
enum perf_event_active_state {
	PERF_EVENT_STATE_ERROR		= -2,
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	=  0,
	PERF_EVENT_STATE_ACTIVE		=  1,
};

struct file;

#define PERF_BUFFER_WRITABLE		0x01

struct perf_buffer {
	atomic_t			refcount;
	struct rcu_head			rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
	struct work_struct		work;
	int				page_order;	/* allocation order  */
#endif
	int				nr_pages;	/* nr of data pages  */
	int				writable;	/* are we writable   */

	atomic_t			poll;		/* POLL_ for wakeups */

	local_t				head;		/* write position    */
	local_t				nest;		/* nested writers    */
	local_t				events;		/* event limit       */
	local_t				wakeup;		/* wakeup stamp      */
	local_t				lost;		/* nr records lost   */

	long				watermark;	/* wakeup watermark  */

	struct perf_event_mmap_page	*user_page;
	void				*data_pages[0];
};

struct perf_pending_entry {
	struct perf_pending_entry *next;
	void (*func)(struct perf_pending_entry *);
};

struct perf_sample_data;

typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
					struct perf_sample_data *,
					struct pt_regs *regs);

enum perf_group_flag {
	PERF_GROUP_SOFTWARE = 0x1,
};

#define SWEVENT_HLIST_BITS	8
#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
	struct hlist_head	heads[SWEVENT_HLIST_SIZE];
	struct rcu_head		rcu_head;
};

#define PERF_ATTACH_CONTEXT	0x01
#define PERF_ATTACH_GROUP	0x02

/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
	struct list_head		group_entry;
	struct list_head		event_entry;
	struct list_head		sibling_list;
	struct hlist_node		hlist_entry;
	int				nr_siblings;
	int				group_flags;
	struct perf_event		*group_leader;
	const struct pmu		*pmu;

	enum perf_event_active_state	state;
	unsigned int			attach_state;
	atomic64_t			count;

	/*
	 * These are the total time in nanoseconds that the event
	 * has been enabled (i.e. eligible to run, and the task has
	 * been scheduled in, if this is a per-task event)
	 * and running (scheduled onto the CPU), respectively.
	 *
	 * They are computed from tstamp_enabled, tstamp_running and
	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
	 */
	u64				total_time_enabled;
	u64				total_time_running;

	/*
	 * These are timestamps used for computing total_time_enabled
	 * and total_time_running when the event is in INACTIVE or
	 * ACTIVE state, measured in nanoseconds from an arbitrary point
	 * in time.
	 * tstamp_enabled: the notional time when the event was enabled
	 * tstamp_running: the notional time when the event was scheduled on
	 * tstamp_stopped: in INACTIVE state, the notional time when the
	 *	event was scheduled off.
	 */
	u64				tstamp_enabled;
	u64				tstamp_running;
	u64				tstamp_stopped;

	struct perf_event_attr		attr;
	struct hw_perf_event		hw;

	struct perf_event_context	*ctx;
	struct file			*filp;

	/*
	 * These accumulate total time (in nanoseconds) that children
	 * events have been enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * Protect attach/detach and child_list:
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_event		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap bits */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;
	int				mmap_locked;
	struct user_struct		*mmap_user;
	struct perf_buffer		*buffer;

	/* poll related */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* delayed work for NMIs and such */
	int				pending_wakeup;
	int				pending_kill;
	int				pending_disable;
	struct perf_pending_entry	pending;

	atomic_t			event_limit;

	void (*destroy)(struct perf_event *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;

	perf_overflow_handler_t		overflow_handler;

#ifdef CONFIG_EVENT_TRACING
	struct ftrace_event_call	*tp_event;
	struct event_filter		*filter;
#endif

#endif /* CONFIG_PERF_EVENTS */
};

/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
	/*
	 * Protect the states of the events in the list,
	 * nr_active, and the list:
	 */
	raw_spinlock_t			lock;
	/*
	 * Protect the list of events.  Locking either mutex or lock
	 * is sufficient to ensure the list doesn't change; to change
	 * the list you need to lock both the mutex and the spinlock.
	 */
	struct mutex			mutex;

	struct list_head		pinned_groups;
	struct list_head		flexible_groups;
	struct list_head		event_list;
	int				nr_events;
	int				nr_active;
	int				is_active;
	int				nr_stat;
	atomic_t			refcount;
	struct task_struct		*task;

	/*
	 * Context clock, runs when context enabled.
	 */
	u64				time;
	u64				timestamp;

	/*
	 * These fields let us detect when two contexts have both
	 * been cloned (inherited) from a common ancestor.
	 */
	struct perf_event_context	*parent_ctx;
	u64				parent_gen;
	u64				generation;
	int				pin_count;
	struct rcu_head			rcu_head;
};

/**
 * struct perf_event_cpu_context - per cpu event context structure
 */
struct perf_cpu_context {
	struct perf_event_context	ctx;
	struct perf_event_context	*task_ctx;
	int				active_oncpu;
	int				max_pertask;
	int				exclusive;
	struct swevent_hlist		*swevent_hlist;
	struct mutex			hlist_mutex;
	int				hlist_refcount;

	/*
	 * Recursion avoidance:
	 *
	 * task, softirq, irq, nmi context
	 */
	int				recursion[4];
};

struct perf_output_handle {
	struct perf_event		*event;
	struct perf_buffer		*buffer;
	unsigned long			wakeup;
	unsigned long			size;
	void				*addr;
	int				page;
	int				nmi;
	int				sample;
};

#ifdef CONFIG_PERF_EVENTS

/*
 * Set by architecture code:
 */
extern int perf_max_events;

extern const struct pmu *hw_perf_event_init(struct perf_event *event);

extern void perf_event_task_sched_in(struct task_struct *task);
extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
extern void perf_event_task_tick(struct task_struct *task);
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void set_perf_event_pending(void);
extern void perf_event_do_pending(void);
extern void perf_event_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
extern void perf_disable(void);
extern void perf_enable(void);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
				int cpu,
				pid_t pid,
				perf_overflow_handler_t callback);
extern u64 perf_event_read_value(struct perf_event *event,
				 u64 *enabled, u64 *running);

struct perf_sample_data {
	u64				type;

	u64				ip;
	struct {
		u32	pid;
		u32	tid;
	}				tid_entry;
	u64				time;
	u64				addr;
	u64				id;
	u64				stream_id;
	struct {
		u32	cpu;
		u32	reserved;
	}				cpu_entry;
	u64				period;
	struct perf_callchain_entry	*callchain;
	struct perf_raw_record		*raw;
};

static inline
void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
{
	data->addr = addr;
	data->raw  = NULL;
}

extern void perf_output_sample(struct perf_output_handle *handle,
			       struct perf_event_header *header,
			       struct perf_sample_data *data,
			       struct perf_event *event);
extern void perf_prepare_sample(struct perf_event_header *header,
				struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event, int nmi,
				 struct perf_sample_data *data,
				 struct pt_regs *regs);

/*
 * Return 1 for a software event, 0 for a hardware event
 */
static inline int is_software_event(struct perf_event *event)
{
	switch (event->attr.type) {
	case PERF_TYPE_SOFTWARE:
	case PERF_TYPE_TRACEPOINT:
	/* for now the breakpoint stuff also works as software event */
	case PERF_TYPE_BREAKPOINT:
		return 1;
	}
	return 0;
}

extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);

extern void
perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);

/*
 * Take a snapshot of the regs. Skip ip and frame pointer to
 * the nth caller. We only need a few of the regs:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - bp for callchains
 * - eflags, for future purposes, just in case
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
{
	unsigned long ip;

	memset(regs, 0, sizeof(*regs));

	switch (skip) {
	case 1 :
		ip = CALLER_ADDR0;
		break;
	case 2 :
		ip = CALLER_ADDR1;
		break;
	case 3 :
		ip = CALLER_ADDR2;
		break;
	case 4:
		ip = CALLER_ADDR3;
		break;
	/* No need to support further for now */
	default:
		ip = 0;
	}

	return perf_arch_fetch_caller_regs(regs, ip, skip);
}

static inline void
perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
	if (atomic_read(&perf_swevent_enabled[event_id])) {
		struct pt_regs hot_regs;

		if (!regs) {
			perf_fetch_caller_regs(&hot_regs, 1);
			regs = &hot_regs;
		}
		__perf_sw_event(event_id, nr, nmi, regs, addr);
	}
}

extern void perf_event_mmap(struct vm_area_struct *vma);
extern struct perf_guest_info_callbacks *perf_guest_cbs;
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);

extern void perf_event_comm(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);

extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

extern void perf_event_init(void);
extern void perf_tp_event(u64 addr, u64 count, void *record,
			  int entry_size, struct pt_regs *regs,
			  struct hlist_head *head, int rctx);
extern void perf_bp_event(struct perf_event *event, void *data);

#ifndef perf_misc_flags
#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_RECORD_MISC_USER : \
				 PERF_RECORD_MISC_KERNEL)
#define perf_instruction_pointer(regs)	instruction_pointer(regs)
#endif

extern int perf_output_begin(struct perf_output_handle *handle,
			     struct perf_event *event, unsigned int size,
			     int nmi, int sample);
extern void perf_output_end(struct perf_output_handle *handle);
extern void perf_output_copy(struct perf_output_handle *handle,
			     const void *buf, unsigned int len);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
#else
static inline void
perf_event_task_sched_in(struct task_struct *task)			{ }
static inline void
perf_event_task_sched_out(struct task_struct *task,
			    struct task_struct *next)			{ }
static inline void
perf_event_task_tick(struct task_struct *task)				{ }
static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
static inline void perf_event_exit_task(struct task_struct *child)	{ }
static inline void perf_event_free_task(struct task_struct *task)	{ }
static inline void perf_event_do_pending(void)				{ }
static inline void perf_event_print_debug(void)				{ }
static inline void perf_disable(void)					{ }
static inline void perf_enable(void)					{ }
static inline int perf_event_task_disable(void)				{ return -EINVAL; }
static inline int perf_event_task_enable(void)				{ return -EINVAL; }

static inline void
perf_sw_event(u32 event_id, u64 nr, int nmi,
		     struct pt_regs *regs, u64 addr)			{ }
static inline void
perf_bp_event(struct perf_event *event, void *data)			{ }

static inline int perf_register_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks) { return 0; }
static inline int perf_unregister_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks) { return 0; }

static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
static inline void perf_event_comm(struct task_struct *tsk)		{ }
static inline void perf_event_fork(struct task_struct *tsk)		{ }
static inline void perf_event_init(void)				{ }
static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)		{ }
static inline void perf_event_enable(struct perf_event *event)		{ }
static inline void perf_event_disable(struct perf_event *event)		{ }
#endif

#define perf_output_put(handle, x) \
	perf_output_copy((handle), &(x), sizeof(x))

/*
 * This has to have a higher priority than migration_notifier in sched.c.
 */
#define perf_cpu_notifier(fn)					\
do {								\
	static struct notifier_block fn##_nb __cpuinitdata =	\
		{ .notifier_call = fn, .priority = 20 };	\
	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,		\
		(void *)(unsigned long)smp_processor_id());	\
	fn(&fn##_nb, (unsigned long)CPU_STARTING,		\
		(void *)(unsigned long)smp_processor_id());	\
	fn(&fn##_nb, (unsigned long)CPU_ONLINE,			\
		(void *)(unsigned long)smp_processor_id());	\
	register_cpu_notifier(&fn##_nb);			\
} while (0)
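/*
 * Illustrative sketch, not part of the original header: a PMU driver would
 * typically invoke the macro once at init time with its own notifier
 * callback ("my_pmu_cpu_notify" is a made-up name):
 *
 *	static int __cpuinit
 *	my_pmu_cpu_notify(struct notifier_block *nb, unsigned long action,
 *			  void *hcpu)
 *	{
 *		...
 *		return NOTIFY_OK;
 *	}
 *
 *	perf_cpu_notifier(my_pmu_cpu_notify);
 */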

#endif /* __KERNEL__ */
#endif /* _LINUX_PERF_EVENT_H */