/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */
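
/*
 * Rough data flow (an illustrative sketch, not an exact call graph):
 *
 *	interrupt/NMI -> log_sample() -> per-cpu ring buffer
 *	per-cpu ring buffer -> wq_sync_buffer() -> sync_buffer() -> event buffer
 */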

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

/* flush each cpu's local buffer roughly ten times per second */
#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer)
		ring_buffer_free(op_ring_buffer);
	op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;
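	/*
	 * Size the per-cpu allocation in bytes: RB_EVENT_HDR_SIZE
	 * approximates the ring buffer's per-event header so that
	 * roughly buffer_size samples fit.
	 */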
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);

	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	int i;

	work_enabled = 0;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		cancel_delayed_work(&b->work);
	}

	flush_scheduled_work();
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. The entry passed in may be uninitialized. The function
 * reserves a data array of the given size alongside the sample. Call
 * op_cpu_buffer_write_commit() after preparing the sample. Returns
 * NULL on error, otherwise a pointer to the reserved sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]));
	if (!entry->event)
		return NULL;
	entry->sample = ring_buffer_event_data(entry->event);
	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}
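
/*
 * Typical reserve/commit cycle (an illustrative sketch only; see
 * op_add_code() below for the real in-tree usage. "flags" and "task"
 * are placeholders here):
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *
 *	sample = op_cpu_buffer_write_reserve(&entry, 1);
 *	if (!sample)
 *		return -ENOMEM;
 *	sample->eip = ESCAPE_CODE;
 *	sample->event = flags;
 *	op_cpu_buffer_add_data(&entry, (unsigned long)task);
 *	op_cpu_buffer_write_commit(&entry);
 */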

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer, cpu, NULL);
	if (!e)
		return NULL;

	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
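	/* recover the number of data words from the total event length */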
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}
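
/*
 * Read-side sketch (illustrative only; sync_buffer() in buffer_sync.c
 * is the real consumer of these helpers):
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *	unsigned long avail = op_cpu_buffer_entries(cpu);
 *
 *	while (avail--) {
 *		sample = op_cpu_buffer_read_entry(&entry, cpu);
 *		if (!sample)
 *			break;
 *		... process sample->eip, sample->event and entry.data ...
 *	}
 */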

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)task);

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

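/* write a plain pc/event pair with no attached data words */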
static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event)
{
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, current))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel = !user_mode(regs);
	unsigned long pc = profile_pc(regs);

	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
		       unsigned long pc, int code, int size)
{
	struct op_sample *sample;
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current))
		goto fail;

	/* reserve two extra data words to carry code and pc */
	sample = op_cpu_buffer_write_reserve(entry, size + 2);
	if (!sample)
		goto fail;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;		/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);

	return;

fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event)
		return 0;
	return op_cpu_buffer_add_data(entry, val);
}

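/* store a 64-bit value as two 32-bit data words, low word first */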
int oprofile_add_data64(struct op_entry *entry, u64 val)
{
	if (!entry->event)
		return 0;
	if (op_cpu_buffer_get_size(entry) < 2)
		/*
		 * return 0 to indicate that the entry's buffer is too
		 * small, even if there is some space left
		 */
		return 0;
	if (!op_cpu_buffer_add_data(entry, (u32)val))
		return 0;
	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
	if (!entry->event)
		return -EINVAL;
	return op_cpu_buffer_write_commit(entry);
}
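
/*
 * Data-sample cycle (an illustrative sketch; architecture code such as
 * the AMD IBS handler is the expected caller. "code", "pc", "val0" and
 * "val1" are placeholders):
 *
 *	struct op_entry entry;
 *
 *	oprofile_write_reserve(&entry, regs, pc, code, 2);
 *	oprofile_add_data(&entry, val0);
 *	oprofile_add_data(&entry, val1);
 *	oprofile_write_commit(&entry);
 *
 * A failed reserve sets entry->event to NULL; oprofile_add_data() and
 * oprofile_write_commit() check for that, so the caller need not test
 * each step.
 */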

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * A broken frame can give an eip with the same value as an
	 * escape code; abort the trace if we get it.
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id()) {
		printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
		       smp_processor_id(), b->cpu);

		if (!cpu_online(b->cpu)) {
			cancel_delayed_work(&b->work);
			return;
		}
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}