diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index ce0d9da52a8ab808a24e8d30bff27d631b474713..52146c2a8d9727ce262ced510b73fa76ffdab5a8 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); + perf_counter_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 0000000000000000000000000000000000000000..22c4469abf44c6980673ecd0bead3ae8e2cd2e2d --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,171 @@ +/* + * Performance counters: + * + * Copyright(C) 2008, Thomas Gleixner + * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include + +#include +#include +#include +#include +#include + +struct task_struct; + +/* + * Generalized hardware event types, used by the hw_event_type parameter + * of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + /* + * If this bit is set in the type, then trigger NMI sampling: + */ + PERF_COUNT_NMI = (1 << 30), +}; + +/* + * IRQ-notification data record type: + */ +enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + PERF_RECORD_GROUP, +}; + +/** + * struct hw_perf_counter - performance counter hardware details + */ +struct hw_perf_counter { + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + s32 next_count; + u64 irq_period; +}; + +/* + * Hardcoded buffer length limit for now, for IRQ-fed events: + */ +#define PERF_DATA_BUFLEN 2048 + +/** + * struct perf_data - performance counter IRQ data sampling ... + */ +struct perf_data { + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { + struct list_head list; + int active; +#if BITS_PER_LONG == 64 + atomic64_t count; +#else + atomic_t count32[2]; +#endif + u64 __irq_period; + + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * Protect attach/detach: + */ + struct mutex mutex; + + int oncpu; + int cpu; + + s32 hw_event_type; + enum perf_record_type record_type; + + /* read() / irq related data */ + wait_queue_head_t waitq; + /* optional: for NMIs */ + int wakeup_pending; + struct perf_data *irqdata; + struct perf_data *usrdata; + struct perf_data data[2]; +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +#ifdef CONFIG_PERF_COUNTERS + /* + * Protect the list of counters: + */ + spinlock_t lock; + struct list_head counters; + int nr_counters; + int nr_active; + struct task_struct *task; +#endif +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; +}; + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +#ifdef CONFIG_PERF_COUNTERS +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_print_debug(void); +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu) { } +static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_print_debug(void) { } +#endif + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447790dd433d0e874285ced3a5499328..4c530278391b72dc6232179ae7432dbefe70ce5e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb920d317c4b5217ebfbb1602e7606d90..6cce728a6263eec7e2d8002c44efbb2dcb54141b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif diff --git a/init/Kconfig b/init/Kconfig index f763762d544a135a0a06f67f36701b9f0d331140..78bede218f10f86c2a1ebc46ab5ffe8a6fc79f3c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -732,6 +732,35 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config HAVE_PERF_COUNTERS + bool + +menu "Performance Counters" + +config PERF_COUNTERS + bool "Kernel Performance Counters" + depends on HAVE_PERF_COUNTERS + default y + help + Enable kernel support for performance counter hardware. + + Performance counters are special hardware registers available + on most modern CPUs. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Counter subsystem provides an abstraction of + these hardware capabilities, available via a system call. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +endmenu + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EMBEDDED diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19d6ac0752597f5a23e18341d1d579a..1f184a1dc406a0f15c178cc403d27e97f417287f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/fork.c b/kernel/fork.c index 2a372a0e206fa2de99dbfdd594f86f6eb927bf40..441fadff1fa476a91eb01d427e25b9b926254bc0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; rt_mutex_init_task(p); + perf_counter_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 0000000000000000000000000000000000000000..20508f053658f86ce2dc818d975b36a0d2edf9b1 --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,943 @@ +/* + * Performance counter core code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +/* + * Mutex for (sysadmin-configurable) counter reservations: + */ +static DEFINE_MUTEX(perf_resource_mutex); + +/* + * Architecture provided APIs - weak aliases: + */ + +int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +{ + return -EINVAL; +} + +void __weak hw_perf_counter_enable(struct perf_counter *counter) { } +void __weak hw_perf_counter_disable(struct perf_counter *counter) { } +void __weak hw_perf_counter_read(struct perf_counter *counter) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } + +#if BITS_PER_LONG == 64 + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 64 bit version - no complications. + */ +static inline u64 perf_read_counter_safe(struct perf_counter *counter) +{ + return (u64) atomic64_read(&counter->count); +} + +#else + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 32 bit version. + */ +static u64 perf_read_counter_safe(struct perf_counter *counter) +{ + u32 cntl, cnth; + + local_irq_disable(); + do { + cnth = atomic_read(&counter->count32[1]); + cntl = atomic_read(&counter->count32[0]); + } while (cnth != atomic_read(&counter->count32[1])); + + local_irq_enable(); + + return cntl | ((u64) cnth) << 32; +} + +#endif + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + ctx->nr_active--; + cpuctx->active_oncpu--; + counter->task = NULL; + } + ctx->nr_counters--; + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_del_init(&counter->list); + hw_perf_enable_all(); + + if (!ctx->task) { + /* + * Allow more per task counters with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_counters - ctx->nr_counters, + perf_max_counters - perf_reserved_percpu); + } + + spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with counter->mutex held. + * + * CPU counters are removed with a smp call. For task counters we only + * call when the task is on a CPU. + */ +static void perf_remove_from_context(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu counters are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(counter->cpu, + __perf_remove_from_context, + counter, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_remove_from_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the counter safely, if it the call above did not + * succeed. + */ + if (!list_empty(&counter->list)) { + ctx->nr_counters--; + list_del_init(&counter->list); + counter->task = NULL; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to install and enable a preformance counter + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + int cpu = smp_processor_id(); + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_add_tail(&counter->list, &ctx->counters); + hw_perf_enable_all(); + + ctx->nr_counters++; + + if (cpuctx->active_oncpu < perf_max_counters) { + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + + if (!ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, + struct perf_counter *counter, + int cpu) +{ + struct task_struct *task = ctx->task; + + counter->ctx = ctx; + if (!task) { + /* + * Per cpu counters are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + counter, 1); + return; + } + + counter->task = task; +retry: + task_oncpu_function_call(task, __perf_install_in_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active and the counter has not been added + * we need to retry the smp call. + */ + if (ctx->nr_active && list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the counter safely, if it the call above did not + * succeed. + */ + if (list_empty(&counter->list)) { + list_add_tail(&counter->list, &ctx->counters); + ctx->nr_counters++; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but hw_perf_counter_disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!cpuctx->task_ctx)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (!ctx->nr_active) + break; + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + ctx->nr_active--; + cpuctx->active_oncpu--; + } + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = NULL; +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but hw_perf_counter_enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (ctx->nr_active == cpuctx->max_pertask) + break; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Rotate the first entry last: + */ + hw_perf_disable_all(); + list_for_each_entry(counter, &ctx->counters, list) { + list_del(&counter->list); + list_add_tail(&counter->list, &ctx->counters); + break; + } + hw_perf_enable_all(); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = &task->perf_counter_ctx; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counters); + ctx->nr_counters = 0; + ctx->task = task; +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __hw_perf_counter_read(void *info) +{ + hw_perf_counter_read(info); +} + +static u64 perf_read_counter(struct perf_counter *counter) +{ + /* + * If counter is enabled and currently active on a CPU, update the + * value in the counter structure: + */ + if (counter->active) { + smp_call_function_single(counter->oncpu, + __hw_perf_counter_read, counter, 1); + } + + return perf_read_counter_safe(counter); +} + +/* + * Cross CPU call to switch performance data pointers + */ +static void __perf_switch_irq_data(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task) { + if (cpuctx->task_ctx != ctx) + return; + spin_lock(&ctx->lock); + } + + /* Change the pointer NMI safe */ + atomic_long_set((atomic_long_t *)&counter->irqdata, + (unsigned long) counter->usrdata); + counter->usrdata = oldirqdata; + + if (ctx->task) + spin_unlock(&ctx->lock); +} + +static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + struct task_struct *task = ctx->task; + + if (!task) { + smp_call_function_single(counter->cpu, + __perf_switch_irq_data, + counter, 1); + return counter->usrdata; + } + +retry: + spin_lock_irq(&ctx->lock); + if (!counter->active) { + counter->irqdata = counter->usrdata; + counter->usrdata = oldirqdata; + spin_unlock_irq(&ctx->lock); + return oldirqdata; + } + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_switch_irq_data, counter); + /* Might have failed, because task was scheduled out */ + if (counter->irqdata == oldirqdata) + goto retry; + + return counter->usrdata; +} + +static void put_context(struct perf_counter_context *ctx) +{ + if (ctx->task) + put_task_struct(ctx->task); +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * If cpu is not a wildcard then this is a percpu counter: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU counter: */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a counter to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + + WARN_ON_ONCE(ctx->task); + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + ctx = &task->perf_counter_ctx; + ctx->task = task; + + /* Reuse ptrace permission checks for now. */ + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + put_context(ctx); + return ERR_PTR(-EACCES); + } + + return ctx; +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_counter *counter = file->private_data; + struct perf_counter_context *ctx = counter->ctx; + + file->private_data = NULL; + + mutex_lock(&counter->mutex); + + perf_remove_from_context(counter); + put_context(ctx); + + mutex_unlock(&counter->mutex); + + kfree(counter); + + return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ + u64 cntval; + + if (count != sizeof(cntval)) + return -EINVAL; + + mutex_lock(&counter->mutex); + cntval = perf_read_counter(counter); + mutex_unlock(&counter->mutex); + + return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); +} + +static ssize_t +perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) +{ + if (!usrdata->len) + return 0; + + count = min(count, (size_t)usrdata->len); + if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) + return -EFAULT; + + /* Adjust the counters */ + usrdata->len -= count; + if (!usrdata->len) + usrdata->rd_idx = 0; + else + usrdata->rd_idx += count; + + return count; +} + +static ssize_t +perf_read_irq_data(struct perf_counter *counter, + char __user *buf, + size_t count, + int nonblocking) +{ + struct perf_data *irqdata, *usrdata; + DECLARE_WAITQUEUE(wait, current); + ssize_t res; + + irqdata = counter->irqdata; + usrdata = counter->usrdata; + + if (usrdata->len + irqdata->len >= count) + goto read_pending; + + if (nonblocking) + return -EAGAIN; + + spin_lock_irq(&counter->waitq.lock); + __add_wait_queue(&counter->waitq, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (usrdata->len + irqdata->len >= count) + break; + + if (signal_pending(current)) + break; + + spin_unlock_irq(&counter->waitq.lock); + schedule(); + spin_lock_irq(&counter->waitq.lock); + } + __remove_wait_queue(&counter->waitq, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&counter->waitq.lock); + + if (usrdata->len + irqdata->len < count) + return -ERESTARTSYS; +read_pending: + mutex_lock(&counter->mutex); + + /* Drain pending data first: */ + res = perf_copy_usrdata(usrdata, buf, count); + if (res < 0 || res == count) + goto out; + + /* Switch irq buffer: */ + usrdata = perf_switch_irq_data(counter); + if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + if (!res) + res = -EFAULT; + } else { + res = count; + } +out: + mutex_unlock(&counter->mutex); + + return res; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_counter *counter = file->private_data; + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + return perf_read_hw(counter, buf, count); + + case PERF_RECORD_IRQ: + case PERF_RECORD_GROUP: + return perf_read_irq_data(counter, buf, count, + file->f_flags & O_NONBLOCK); + } + return -EINVAL; +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_counter *counter = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &counter->waitq, wait); + + spin_lock_irqsave(&counter->waitq.lock, flags); + if (counter->usrdata->len || counter->irqdata->len) + events |= POLLIN; + spin_unlock_irqrestore(&counter->waitq.lock, flags); + + return events; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, +}; + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +{ + struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + + if (!counter) + return NULL; + + mutex_init(&counter->mutex); + INIT_LIST_HEAD(&counter->list); + init_waitqueue_head(&counter->waitq); + + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->record_type = record_type; + counter->__irq_period = hw_event_period; + counter->wakeup_pending = 0; + + return counter; +} + +/** + * sys_perf_task_open - open a performance counter associate it to a task + * @hw_event_type: event type for monitoring/sampling... + * @pid: target pid + */ +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + int ret; + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = -ENOMEM; + counter = perf_counter_alloc(hw_event_period, cpu, record_type); + if (!counter) + goto err_put_context; + + ret = hw_perf_counter_init(counter, hw_event_type); + if (ret) + goto err_free_put_context; + + perf_install_in_context(ctx, counter, cpu); + + ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + if (ret < 0) + goto err_remove_free_put_context; + + return ret; + +err_remove_free_put_context: + mutex_lock(&counter->mutex); + perf_remove_from_context(counter); + mutex_unlock(&counter->mutex); + +err_free_put_context: + kfree(counter); + +err_put_context: + put_context(ctx); + + return ret; +} + +static void __cpuinit perf_init_cpu(int cpu) +{ + struct perf_cpu_context *ctx; + + ctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_init(&ctx->ctx.lock); + INIT_LIST_HEAD(&ctx->ctx.counters); + + mutex_lock(&perf_resource_mutex); + ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_counter *counter, *tmp; + + list_for_each_entry_safe(counter, tmp, &ctx->counters, list) + __perf_remove_from_context(counter); + +} +static void perf_exit_cpu(int cpu) +{ + smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); +} +#else +static inline void perf_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_init_cpu(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, +}; + +static int __init perf_counter_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); + + return 0; +} +early_initcall(perf_counter_init); + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_counters) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, + perf_max_counters - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_overcommit = val; + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); + diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3dc21a7bf6513a978cf0ed2e8c19a8f..254d56de2548d11f75064127a752816eb3bcaaba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); + perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (current->sched_class->post_schedule) @@ -4296,6 +4319,7 @@ void scheduler_tick(void) rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a23281707610c6d52338af9e091116e29b987..4be8bbc7577ccb4edbcec3b665db7d7291a71054 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open);