/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer)
		ring_buffer_free(op_ring_buffer);
	op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);

	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}
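
/*
 * Sizing sketch (illustrative numbers, not a stated default): struct
 * op_sample holds an eip and an event word plus a flexible data[]
 * array, i.e. 16 bytes on a 64-bit build. With buffer_size = 65536
 * samples, the allocation above requests 65536 * (16 + 4) bytes,
 * roughly 1.25 MiB for each CPU's slice of the ring buffer.
 */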

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per CPU so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	work_enabled = 0;
}

void flush_cpu_work(void)
{
	int i;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/* the work items are per-CPU, so flush_sync is not needed */
		flush_delayed_work(&b->work);
	}
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. Struct op_entry can be uninitialized. The function reserves
 * a data array of the given size. Call op_cpu_buffer_write_commit()
 * after filling in the sample. On error NULL is returned, otherwise a
 * pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]));
	if (!entry->event)
		return NULL;
	entry->sample = ring_buffer_event_data(entry->event);
	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}
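
/*
 * Illustrative sketch (not part of the original file): a writer pairs
 * the reserve and commit calls roughly the way op_add_code() below
 * does, e.g. for one escape record carrying a single data word:
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *
 *	sample = op_cpu_buffer_write_reserve(&entry, 1);
 *	if (!sample)
 *		return -ENOMEM;
 *	sample->eip = ESCAPE_CODE;
 *	sample->event = flags;
 *	op_cpu_buffer_add_data(&entry, (unsigned long)task);
 *	op_cpu_buffer_write_commit(&entry);
 */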

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
	if (!e)
		return NULL;

	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}
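
/*
 * Illustrative sketch (not part of the original file): sync_buffer()
 * in buffer_sync.c is the real consumer of this API; draining one
 * CPU's pending entries looks roughly like this, process() being a
 * hypothetical handler:
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *	unsigned long avail = op_cpu_buffer_entries(cpu);
 *
 *	while (avail--) {
 *		sample = op_cpu_buffer_read_entry(&entry, cpu);
 *		if (!sample)
 *			break;
 *		process(sample->eip, sample->event);
 *	}
 */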

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)task);

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event,
	   struct task_struct *task)
{
	struct task_struct *tsk = task ? task : current;
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}
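
/*
 * Illustrative note (derived from the flag logic above, not in the
 * original file): a kernel-entry hit at pc with an unchanged task ends
 * up in the buffer as two records:
 *
 *	{ .eip = ESCAPE_CODE, .event = KERNEL_CTX_SWITCH | IS_KERNEL }
 *	{ .eip = pc,          .event = event }
 */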

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel,
			  struct task_struct *task)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
				unsigned long event, int is_kernel,
				struct task_struct *task)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel;
	unsigned long pc;

	if (likely(regs)) {
		is_kernel = !user_mode(regs);
		pc = profile_pc(regs);
	} else {
		is_kernel = 0;    /* This value will not be used */
		pc = ESCAPE_CODE; /* as this causes an early return. */
	}

	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
		       unsigned long pc, int code, int size)
{
	struct op_sample *sample;
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current))
		goto fail;

	sample = op_cpu_buffer_write_reserve(entry, size + 2);
	if (!sample)
		goto fail;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;		/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);

	return;

fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event)
		return 0;
	return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
	if (!entry->event)
		return 0;
	if (op_cpu_buffer_get_size(entry) < 2)
		/*
		 * the function returns 0 to indicate that the buffer
		 * is too small, even if some space is left
		 */
		return 0;
	if (!op_cpu_buffer_add_data(entry, (u32)val))
		return 0;
	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}
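
/*
 * Note (an assumption about the read side, not stated here): the low
 * word is stored first, so a consumer can rebuild the value with
 *
 *	u64 val = (u64)low | ((u64)high << 32);
 */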

int oprofile_write_commit(struct op_entry *entry)
{
	if (!entry->event)
		return -EINVAL;
	return op_cpu_buffer_write_commit(entry);
}
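
/*
 * Illustrative sketch (not part of the original file): an architecture
 * driver would emit a sample with two data words like this, where
 * data0 and data1 are hypothetical values:
 *
 *	struct op_entry entry;
 *
 *	oprofile_write_reserve(&entry, regs, pc, code, 2);
 *	oprofile_add_data(&entry, data0);
 *	oprofile_add_data(&entry, data1);
 *	oprofile_write_commit(&entry);
 *
 * If the reserve fails, entry->event is NULL, so oprofile_add_data()
 * returns 0 and oprofile_write_commit() returns -EINVAL instead of
 * touching the ring buffer.
 */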

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * a broken frame can give an eip with the same value as the
	 * escape code; abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid CPU buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct CPU.
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id()) {
		printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
		       smp_processor_id(), b->cpu);

		if (!cpu_online(b->cpu)) {
			cancel_delayed_work(&b->work);
			return;
		}
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}