提交 a3d4fd7a 编写于 作者: Ingo Molnar

Merge branch 'uprobes/core' of...

Merge branch 'uprobes/core' of git://git.kernel.org/pub/scm/linux/kernel/git/oleg/misc into perf/core

Improve uprobes performance by adding 'pre-filtering' support,
by Oleg Nesterov:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m0.040s
	user    0m0.027s
	sys     0m0.010s

	# perf probe -x /lib/libc.so.6 syscall
	# perf record -e probe_libc:syscall sleep 100 &

Before this series:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m1.714s
	user    0m0.103s
	sys     0m1.607s

After:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m0.037s
	user    0m0.013s
	sys     0m0.023s
Signed-off-by: Ingo Molnar <mingo@kernel.org>
...@@ -680,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) ...@@ -680,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (auprobe->insn[i] == 0x66) if (auprobe->insn[i] == 0x66)
continue; continue;
if (auprobe->insn[i] == 0x90) if (auprobe->insn[i] == 0x90) {
regs->ip += i + 1;
return true; return true;
}
break; break;
} }
......
...@@ -135,16 +135,21 @@ struct hw_perf_event { ...@@ -135,16 +135,21 @@ struct hw_perf_event {
struct { /* software */ struct { /* software */
struct hrtimer hrtimer; struct hrtimer hrtimer;
}; };
struct { /* tracepoint */
struct task_struct *tp_target;
/* for tp_event->class */
struct list_head tp_list;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT #ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */ struct { /* breakpoint */
struct arch_hw_breakpoint info;
struct list_head bp_list;
/* /*
* Crufty hack to avoid the chicken and egg * Crufty hack to avoid the chicken and egg
* problem hw_breakpoint has with context * problem hw_breakpoint has with context
* creation and event initalization. * creation and event initalization.
*/ */
struct task_struct *bp_target; struct task_struct *bp_target;
struct arch_hw_breakpoint info;
struct list_head bp_list;
}; };
#endif #endif
}; };
......
...@@ -35,13 +35,20 @@ struct inode; ...@@ -35,13 +35,20 @@ struct inode;
# include <asm/uprobes.h> # include <asm/uprobes.h>
#endif #endif
#define UPROBE_HANDLER_REMOVE 1
#define UPROBE_HANDLER_MASK 1
/*
 * Context value handed to a uprobe_consumer's ->filter() callback as its
 * 'ctx' argument.
 * NOTE(review): judging by the names, each value identifies the operation
 * that triggered the filter call (consumer registration, unregistration,
 * or a new mapping) -- confirm against the uprobes core, which is not
 * visible here.
 */
enum uprobe_filter_ctx {
UPROBE_FILTER_REGISTER,
UPROBE_FILTER_UNREGISTER,
UPROBE_FILTER_MMAP,
};
struct uprobe_consumer { struct uprobe_consumer {
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
/* bool (*filter)(struct uprobe_consumer *self,
* filter is optional; If a filter exists, handler is run enum uprobe_filter_ctx ctx,
* if and only if filter returns true. struct mm_struct *mm);
*/
bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
struct uprobe_consumer *next; struct uprobe_consumer *next;
}; };
...@@ -94,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign ...@@ -94,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign
extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_mmap(struct vm_area_struct *vma); extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
...@@ -117,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) ...@@ -117,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{ {
return -ENOSYS; return -ENOSYS;
} }
/*
 * Stub uprobe_apply(): ignores all of its arguments and always fails
 * with -ENOSYS.
 * NOTE(review): presumably the variant used when uprobes support is
 * compiled out -- the guarding #ifdef is not visible here.
 */
static inline int uprobe_apply(struct inode *inode, loff_t offset,
			       struct uprobe_consumer *uc, bool add)
{
	return -ENOSYS;
}
static inline void static inline void
uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{ {
......
...@@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, ...@@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) { if (task) {
event->attach_state = PERF_ATTACH_TASK; event->attach_state = PERF_ATTACH_TASK;
if (attr->type == PERF_TYPE_TRACEPOINT)
event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT #ifdef CONFIG_HAVE_HW_BREAKPOINT
/* /*
* hw_breakpoint is a bit difficult here.. * hw_breakpoint is a bit difficult here..
*/ */
if (attr->type == PERF_TYPE_BREAKPOINT) else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task; event->hw.bp_target = task;
#endif #endif
} }
......
此差异已折叠。
...@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, ...@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
kiov->iov_len, kiov->iov_base); kiov->iov_len, kiov->iov_base);
} }
/*
* This is declared in linux/regset.h and defined in machine-dependent
* code. We put the export here, near the primary machine-neutral use,
* to ensure no machine forgets it.
*/
EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif #endif
int ptrace_request(struct task_struct *child, long request, int ptrace_request(struct task_struct *child, long request,
......
...@@ -66,7 +66,6 @@ ...@@ -66,7 +66,6 @@
#define TP_FLAG_TRACE 1 #define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2 #define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4 #define TP_FLAG_REGISTERED 4
#define TP_FLAG_UPROBE 8
/* data_rloc: data relative location, compatible with u32 */ /* data_rloc: data relative location, compatible with u32 */
......
...@@ -28,20 +28,21 @@ ...@@ -28,20 +28,21 @@
#define UPROBE_EVENT_SYSTEM "uprobes" #define UPROBE_EVENT_SYSTEM "uprobes"
/*
 * Per-probe record of the perf events currently attached to a
 * trace_uprobe; consulted to decide whether a given mm is of interest.
 */
struct trace_uprobe_filter {
/* taken for write while the fields below are modified, for read while the filter is evaluated */
rwlock_t rwlock;
/* number of attached perf events that have no hw.tp_target task */
int nr_systemwide;
/* attached perf events that do have a target task, linked through hw.tp_list */
struct list_head perf_events;
};
/* /*
* uprobe event core functions * uprobe event core functions
*/ */
struct trace_uprobe;
struct uprobe_trace_consumer {
struct uprobe_consumer cons;
struct trace_uprobe *tu;
};
struct trace_uprobe { struct trace_uprobe {
struct list_head list; struct list_head list;
struct ftrace_event_class class; struct ftrace_event_class class;
struct ftrace_event_call call; struct ftrace_event_call call;
struct uprobe_trace_consumer *consumer; struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct inode *inode; struct inode *inode;
char *filename; char *filename;
unsigned long offset; unsigned long offset;
...@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list); ...@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
/*
 * Put @filter into its empty state: no system-wide users, an empty list
 * of per-task perf events, and a freshly initialised rwlock.
 */
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
	filter->nr_systemwide = 0;
	INIT_LIST_HEAD(&filter->perf_events);
	rwlock_init(&filter->rwlock);
}
/*
 * Return true when @filter has neither a system-wide user nor any
 * per-task perf event linked on it.
 */
static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
{
	if (filter->nr_systemwide)
		return false;

	return list_empty(&filter->perf_events);
}
/* /*
* Allocate new trace_uprobe and initialize it (including uprobes). * Allocate new trace_uprobe and initialize it (including uprobes).
*/ */
...@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) ...@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
goto error; goto error;
INIT_LIST_HEAD(&tu->list); INIT_LIST_HEAD(&tu->list);
tu->consumer.handler = uprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
return tu; return tu;
error: error:
...@@ -253,16 +268,18 @@ static int create_trace_uprobe(int argc, char **argv) ...@@ -253,16 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret) if (ret)
goto fail_address_parse; goto fail_address_parse;
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
inode = igrab(path.dentry->d_inode); inode = igrab(path.dentry->d_inode);
if (!S_ISREG(inode->i_mode)) { path_put(&path);
if (!inode || !S_ISREG(inode->i_mode)) {
ret = -EINVAL; ret = -EINVAL;
goto fail_address_parse; goto fail_address_parse;
} }
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
argc -= 2; argc -= 2;
argv += 2; argv += 2;
...@@ -469,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { ...@@ -469,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
}; };
/* uprobe handler */ /* uprobe handler */
static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{ {
struct uprobe_trace_entry_head *entry; struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event; struct ring_buffer_event *event;
...@@ -479,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) ...@@ -479,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
unsigned long irq_flags; unsigned long irq_flags;
struct ftrace_event_call *call = &tu->call; struct ftrace_event_call *call = &tu->call;
tu->nhit++;
local_save_flags(irq_flags); local_save_flags(irq_flags);
pc = preempt_count(); pc = preempt_count();
...@@ -489,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) ...@@ -489,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->event.type, event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc); size, irq_flags, pc);
if (!event) if (!event)
return; return 0;
entry = ring_buffer_event_data(event); entry = ring_buffer_event_data(event);
entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1]; data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++) for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event)) if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc); trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
return 0;
} }
/* Event entry printers */ /* Event entry printers */
...@@ -537,42 +554,43 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e ...@@ -537,42 +554,43 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return TRACE_TYPE_PARTIAL_LINE; return TRACE_TYPE_PARTIAL_LINE;
} }
static int probe_event_enable(struct trace_uprobe *tu, int flag) static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
{ {
struct uprobe_trace_consumer *utc; return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
int ret = 0; }
if (!tu->inode || tu->consumer) typedef bool (*filter_func_t)(struct uprobe_consumer *self,
return -EINTR; enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
static int
probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
{
int ret = 0;
utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); if (is_trace_uprobe_enabled(tu))
if (!utc)
return -EINTR; return -EINTR;
utc->cons.handler = uprobe_dispatcher; WARN_ON(!uprobe_filter_is_empty(&tu->filter));
utc->cons.filter = NULL;
ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
if (ret) {
kfree(utc);
return ret;
}
tu->flags |= flag; tu->flags |= flag;
utc->tu = tu; tu->consumer.filter = filter;
tu->consumer = utc; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
tu->flags &= ~flag;
return 0; return ret;
} }
static void probe_event_disable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag)
{ {
if (!tu->inode || !tu->consumer) if (!is_trace_uprobe_enabled(tu))
return; return;
uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->flags &= ~flag; tu->flags &= ~flag;
kfree(tu->consumer);
tu->consumer = NULL;
} }
static int uprobe_event_define_fields(struct ftrace_event_call *event_call) static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
...@@ -646,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu) ...@@ -646,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
} }
#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PERF_EVENTS
/*
 * Decide whether @mm is of interest to @filter.
 *
 * Returns true if at least one system-wide event is attached, or if any
 * perf event on filter->perf_events targets a task whose ->mm is @mm;
 * false otherwise. Takes no lock itself.
 */
static bool
__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
	struct perf_event *pe;
	bool hit = false;

	/* A system-wide event matches every mm. */
	if (filter->nr_systemwide)
		return true;

	list_for_each_entry(pe, &filter->perf_events, hw.tp_list) {
		if (pe->hw.tp_target->mm == mm) {
			hit = true;
			break;
		}
	}

	return hit;
}
/*
 * Check whether the mm of @event's target task is already matched by
 * @tu's filter. Dereferences event->hw.tp_target unconditionally.
 */
static inline bool
uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
{
	struct mm_struct *target_mm = event->hw.tp_target->mm;

	return __uprobe_perf_filter(&tu->filter, target_mm);
}
/*
 * Attach @event to @tu's filter (reached via TRACE_REG_PERF_OPEN).
 *
 * An event with a target task is linked onto filter.perf_events; an
 * event without one increments filter.nr_systemwide. Both updates are
 * made under the filter's write lock. Unless one of the shortcuts below
 * applies, uprobe_apply(..., true) is then called with the lock dropped.
 *
 * Always returns 0; the return value of uprobe_apply() is not checked.
 */
static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
/*
* event->parent != NULL means copy_process(), we can avoid
* uprobe_apply(). current->mm must be probed and we can rely
* on dup_mmap() which preserves the already installed bp's.
*
* attr.enable_on_exec means that exec/mmap will install the
* breakpoints we need.
*/
/* evaluated before list_add(), so only previously attached events are consulted */
done = tu->filter.nr_systemwide ||
event->parent || event->attr.enable_on_exec ||
uprobe_filter_event(tu, event);
list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
/* nothing more to do if a system-wide event was already attached */
done = tu->filter.nr_systemwide;
tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
/* uprobe_apply() is called only after the write lock has been released */
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
return 0;
}
/*
 * Detach @event from @tu's filter (reached via TRACE_REG_PERF_CLOSE).
 *
 * An event with a target task is unlinked from filter.perf_events; an
 * event without one decrements filter.nr_systemwide. Both updates are
 * made under the filter's write lock. Unless one of the conditions below
 * holds, uprobe_apply(..., false) is then called with the lock dropped.
 *
 * Always returns 0; the return value of uprobe_apply() is not checked.
 */
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
/* unlink first so the filter check below only sees the remaining events */
list_del(&event->hw.tp_list);
/*
 * Skip uprobe_apply() if a system-wide event is attached, the target
 * task has PF_EXITING set, or another remaining event still matches
 * the same mm.
 */
done = tu->filter.nr_systemwide ||
(event->hw.tp_target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
} else {
tu->filter.nr_systemwide--;
/* other system-wide events remain attached: nothing to undo */
done = tu->filter.nr_systemwide;
}
write_unlock(&tu->filter.rwlock);
/* uprobe_apply() is called only after the write lock has been released */
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
return 0;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
struct trace_uprobe *tu;
int ret;
tu = container_of(uc, struct trace_uprobe, consumer);
read_lock(&tu->filter.rwlock);
ret = __uprobe_perf_filter(&tu->filter, mm);
read_unlock(&tu->filter.rwlock);
return ret;
}
/* uprobe profile handler */ /* uprobe profile handler */
static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{ {
struct ftrace_event_call *call = &tu->call; struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry; struct uprobe_trace_entry_head *entry;
...@@ -656,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) ...@@ -656,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
int size, __size, i; int size, __size, i;
int rctx; int rctx;
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
__size = sizeof(*entry) + tu->size; __size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64)); size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32); size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return; return 0;
preempt_disable(); preempt_disable();
...@@ -668,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) ...@@ -668,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
if (!entry) if (!entry)
goto out; goto out;
entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1]; data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++) for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
...@@ -678,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) ...@@ -678,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
out: out:
preempt_enable(); preempt_enable();
return 0;
} }
#endif /* CONFIG_PERF_EVENTS */ #endif /* CONFIG_PERF_EVENTS */
...@@ -688,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, ...@@ -688,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
switch (type) { switch (type) {
case TRACE_REG_REGISTER: case TRACE_REG_REGISTER:
return probe_event_enable(tu, TP_FLAG_TRACE); return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
case TRACE_REG_UNREGISTER: case TRACE_REG_UNREGISTER:
probe_event_disable(tu, TP_FLAG_TRACE); probe_event_disable(tu, TP_FLAG_TRACE);
...@@ -696,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, ...@@ -696,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER: case TRACE_REG_PERF_REGISTER:
return probe_event_enable(tu, TP_FLAG_PROFILE); return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER: case TRACE_REG_PERF_UNREGISTER:
probe_event_disable(tu, TP_FLAG_PROFILE); probe_event_disable(tu, TP_FLAG_PROFILE);
return 0; return 0;
case TRACE_REG_PERF_OPEN:
return uprobe_perf_open(tu, data);
case TRACE_REG_PERF_CLOSE:
return uprobe_perf_close(tu, data);
#endif #endif
default: default:
return 0; return 0;
...@@ -710,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, ...@@ -710,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{ {
struct uprobe_trace_consumer *utc;
struct trace_uprobe *tu; struct trace_uprobe *tu;
int ret = 0;
utc = container_of(con, struct uprobe_trace_consumer, cons); tu = container_of(con, struct trace_uprobe, consumer);
tu = utc->tu; tu->nhit++;
if (!tu || tu->consumer != utc)
return 0;
if (tu->flags & TP_FLAG_TRACE) if (tu->flags & TP_FLAG_TRACE)
uprobe_trace_func(tu, regs); ret |= uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE) if (tu->flags & TP_FLAG_PROFILE)
uprobe_perf_func(tu, regs); ret |= uprobe_perf_func(tu, regs);
#endif #endif
return 0; return ret;
} }
static struct trace_event_functions uprobe_funcs = { static struct trace_event_functions uprobe_funcs = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册