/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

/*
 * Read and write access uses spin locking. Thus, a write to the
 * buffer by the NMI handler (x86) could also occur during critical
 * sections when reading the buffer. To avoid this, there are 2
 * buffers for independent read and write access. Read access is in
 * process context only, write access only in the NMI handler. If the
 * read buffer runs empty, both buffers are swapped atomically. There
 * is potentially a small window during swapping where the buffers are
 * disabled and samples could be lost.
 *
 * Using 2 buffers adds a little overhead, but the solution is clear
 * and does not require changes in the ring buffer implementation. It
 * can be changed to a single buffer solution when the ring buffer
 * access is implemented as non-locking atomic code.
 */
static struct ring_buffer *op_ring_buffer_read;
static struct ring_buffer *op_ring_buffer_write;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
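/*
 * Each online CPU's buffer is flushed into the event buffer roughly ten
 * times per second (DEFAULT_TIMER_EXPIRE).  work_enabled tells
 * wq_sync_buffer() whether to re-arm its delayed work.
 */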
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf
		= &__get_cpu_var(cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer_read)
		ring_buffer_free(op_ring_buffer_read);
	op_ring_buffer_read = NULL;
	if (op_ring_buffer_write)
		ring_buffer_free(op_ring_buffer_write);
	op_ring_buffer_write = NULL;
}

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;

	op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer_read)
		goto fail;
	op_ring_buffer_write = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer_write)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	int i;

	work_enabled = 0;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);

		cancel_delayed_work(&b->work);
	}

	flush_scheduled_work();
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. The entry struct may be uninitialized. The function reserves
 * a data array of the length given by size. Call
 * op_cpu_buffer_write_commit() after filling in the sample. On error a
 * NULL pointer is returned, otherwise a pointer to the sample.
 *
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer_write, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]), &entry->irq_flags);
	if (entry->event)
		entry->sample = ring_buffer_event_data(entry->event);
	else
		entry->sample = NULL;

	if (!entry->sample)
		return NULL;

	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event,
					 entry->irq_flags);
}
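
/*
 * Typical write sequence (a sketch only; pc, event and extra_data stand
 * for whatever values the caller wants to log):
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *
 *	sample = op_cpu_buffer_write_reserve(&entry, 1);
 *	if (!sample)
 *		return -ENOMEM;
 *	sample->eip = pc;
 *	sample->event = event;
 *	sample->data[0] = extra_data;
 *	return op_cpu_buffer_write_commit(&entry);
 */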

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
	if (e)
		goto event;
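	/* the read buffer ran empty: swap it with the write buffer and retry */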
	if (ring_buffer_swap_cpu(op_ring_buffer_read,
				 op_ring_buffer_write,
				 cpu))
		return NULL;
	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
	if (e)
		goto event;
	return NULL;

event:
	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer_read, cpu)
		+ ring_buffer_entries_cpu(op_ring_buffer_write, cpu);
}

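
/*
 * Emit an escape sample recording a context change (start of a backtrace,
 * kernel/user transition, task switch) so that sync_buffer() can interpret
 * the samples that follow.  Returns 0 on success, -ENOMEM if no space
 * could be reserved.
 */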
static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		sample->data[0] = (unsigned long)task;

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

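/*
 * Append a single pc/event pair to this cpu's write buffer.  Returns 0 on
 * success, nonzero if the sample could not be stored.
 */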
static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event)
{
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, current))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel = !user_mode(regs);
	unsigned long pc = profile_pc(regs);

	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

#ifdef CONFIG_OPROFILE_IBS

void oprofile_add_ibs_sample(struct pt_regs * const regs,
			     unsigned int * const ibs_sample, int ibs_code)
{
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
	int fail = 0;

	cpu_buf->sample_received++;

	/* backtraces disabled for ibs */
	fail = fail || op_add_code(cpu_buf, 0, is_kernel, current);

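	/*
	 * An IBS record starts with an escape sample carrying ibs_code,
	 * followed by the IBS register values in pairs: three pairs here,
	 * plus three more below for IBS_OP_BEGIN.
	 */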
	fail = fail || op_add_sample(cpu_buf, ESCAPE_CODE,   ibs_code);
	fail = fail || op_add_sample(cpu_buf, ibs_sample[0], ibs_sample[1]);
	fail = fail || op_add_sample(cpu_buf, ibs_sample[2], ibs_sample[3]);
	fail = fail || op_add_sample(cpu_buf, ibs_sample[4], ibs_sample[5]);

	if (ibs_code == IBS_OP_BEGIN) {
		fail = fail || op_add_sample(cpu_buf, ibs_sample[6], ibs_sample[7]);
		fail = fail || op_add_sample(cpu_buf, ibs_sample[8], ibs_sample[9]);
		fail = fail || op_add_sample(cpu_buf, ibs_sample[10], ibs_sample[11]);
	}

	if (fail)
		cpu_buf->sample_lost_overflow++;
}

#endif

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * a broken frame can give an eip with the same value as an
	 * escape code, abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id()) {
		printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
		       smp_processor_id(), b->cpu);

		if (!cpu_online(b->cpu)) {
			cancel_delayed_work(&b->work);
			return;
		}
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}