diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c73a19be76fd80cb6e1edb2c5025d37..bd6ba85beb5492badc062e37a662e7af7e6bcc5d 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. Returns 0 if they all agree, -EAGAIN otherwise. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. 
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. 
We assume the counter @@ -566,11 +626,15 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + /* check_excludes() reads ctrs[n], so record the new counter first */ + ctrs[n] = counter; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d1a59d006eff17111046e0e539ca52..383d4c6423a1c6670a2b09b07d42b271558e7e2b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. 
*/ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac98f815548027875118242de7b1070..c83f51d6e359f9ec4bc7c3eee01e5fd0279348ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ - - __reserved_1 : 26; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + + __reserved_1 : 23; u64 __reserved_2; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 
544193cbc47849263b5fbef8b03ca98cd0dbb56c..89d5e3fe9700e0bdac8963160ff0e8666949c2c4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break;