diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt new file mode 100644 index 0000000000000000000000000000000000000000..7003e10f10f56e0085b47e54728cd6adf3215771 --- /dev/null +++ b/Documentation/trace/ftrace-design.txt @@ -0,0 +1,233 @@ + function tracer guts + ==================== + +Introduction +------------ + +Here we will cover the architecture pieces that the common function tracing +code relies on for proper functioning. Things are broken down into increasing +complexity so that you can start simple and at least get basic functionality. + +Note that this focuses on architecture implementation details only. If you +want more explanation of a feature in terms of common code, review the common +ftrace.txt file. + + +Prerequisites +------------- + +Ftrace relies on these features being implemented: + STACKTRACE_SUPPORT - implement save_stack_trace() + TRACE_IRQFLAGS_SUPPORT - implement include/asm/irqflags.h + + +HAVE_FUNCTION_TRACER +-------------------- + +You will need to implement the mcount and the ftrace_stub functions. + +The exact mcount symbol name will depend on your toolchain. Some call it +"mcount", "_mcount", or even "__mcount". You can probably figure it out by +running something like: + $ echo 'main(){}' | gcc -x c -S -o - - -pg | grep mcount + call mcount +We'll make the assumption below that the symbol is "mcount" just to keep things +nice and simple in the examples. + +Keep in mind that the ABI that is in effect inside of the mcount function is +*highly* architecture/toolchain specific. We cannot help you in this regard, +sorry. Dig up some old documentation and/or find someone more familiar than +you to bang ideas off of. Typically, register usage (argument/scratch/etc...) +is a major issue at this point, especially in relation to the location of the +mcount call (before/after function prologue). You might also want to look at +how glibc has implemented the mcount function for your architecture. It might +be (semi-)relevant. + +The mcount function should check the function pointer ftrace_trace_function +to see if it is set to ftrace_stub. If it is, there is nothing for you to do, +so return immediately. If it isn't, then call that function in the same way +the mcount function normally calls __mcount_internal -- the first argument is +the "frompc" while the second argument is the "selfpc" (adjusted to remove the +size of the mcount call that is embedded in the function). + +For example, if the function foo() calls bar(), when the bar() function calls +mcount(), the arguments mcount() will pass to the tracer are: + "frompc" - the address bar() will use to return to foo() + "selfpc" - the address bar() (with _mcount() size adjustment) + +Also keep in mind that this mcount function will be called *a lot*, so +optimizing for the default case of no tracer will help the smooth running of +your system when tracing is disabled. So the start of the mcount function is +typically the bare min with checking things before returning. That also means +the code flow should usually kept linear (i.e. no branching in the nop case). +This is of course an optimization and not a hard requirement. + +Here is some pseudo code that should help (these functions should actually be +implemented in assembly): + +void ftrace_stub(void) +{ + return; +} + +void mcount(void) +{ + /* save any bare state needed in order to do initial checking */ + + extern void (*ftrace_trace_function)(unsigned long, unsigned long); + if (ftrace_trace_function != ftrace_stub) + goto do_trace; + + /* restore any bare state */ + + return; + +do_trace: + + /* save all state needed by the ABI (see paragraph above) */ + + unsigned long frompc = ...; + unsigned long selfpc = - MCOUNT_INSN_SIZE; + ftrace_trace_function(frompc, selfpc); + + /* restore all state needed by the ABI */ +} + +Don't forget to export mcount for modules ! +extern void mcount(void); +EXPORT_SYMBOL(mcount); + + +HAVE_FUNCTION_TRACE_MCOUNT_TEST +------------------------------- + +This is an optional optimization for the normal case when tracing is turned off +in the system. If you do not enable this Kconfig option, the common ftrace +code will take care of doing the checking for you. + +To support this feature, you only need to check the function_trace_stop +variable in the mcount function. If it is non-zero, there is no tracing to be +done at all, so you can return. + +This additional pseudo code would simply be: +void mcount(void) +{ + /* save any bare state needed in order to do initial checking */ + ++ if (function_trace_stop) ++ return; + + extern void (*ftrace_trace_function)(unsigned long, unsigned long); + if (ftrace_trace_function != ftrace_stub) +... + + +HAVE_FUNCTION_GRAPH_TRACER +-------------------------- + +Deep breath ... time to do some real work. Here you will need to update the +mcount function to check ftrace graph function pointers, as well as implement +some functions to save (hijack) and restore the return address. + +The mcount function should check the function pointers ftrace_graph_return +(compare to ftrace_stub) and ftrace_graph_entry (compare to +ftrace_graph_entry_stub). If either of those are not set to the relevant stub +function, call the arch-specific function ftrace_graph_caller which in turn +calls the arch-specific function prepare_ftrace_return. Neither of these +function names are strictly required, but you should use them anyways to stay +consistent across the architecture ports -- easier to compare & contrast +things. + +The arguments to prepare_ftrace_return are slightly different than what are +passed to ftrace_trace_function. The second argument "selfpc" is the same, +but the first argument should be a pointer to the "frompc". Typically this is +located on the stack. This allows the function to hijack the return address +temporarily to have it point to the arch-specific function return_to_handler. +That function will simply call the common ftrace_return_to_handler function and +that will return the original return address with which, you can return to the +original call site. + +Here is the updated mcount pseudo code: +void mcount(void) +{ +... + if (ftrace_trace_function != ftrace_stub) + goto do_trace; + ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ extern void (*ftrace_graph_return)(...); ++ extern void (*ftrace_graph_entry)(...); ++ if (ftrace_graph_return != ftrace_stub || ++ ftrace_graph_entry != ftrace_graph_entry_stub) ++ ftrace_graph_caller(); ++#endif + + /* restore any bare state */ +... + +Here is the pseudo code for the new ftrace_graph_caller assembly function: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void ftrace_graph_caller(void) +{ + /* save all state needed by the ABI */ + + unsigned long *frompc = &...; + unsigned long selfpc = - MCOUNT_INSN_SIZE; + prepare_ftrace_return(frompc, selfpc); + + /* restore all state needed by the ABI */ +} +#endif + +For information on how to implement prepare_ftrace_return(), simply look at +the x86 version. The only architecture-specific piece in it is the setup of +the fault recovery table (the asm(...) code). The rest should be the same +across architectures. + +Here is the pseudo code for the new return_to_handler assembly function. Note +that the ABI that applies here is different from what applies to the mcount +code. Since you are returning from a function (after the epilogue), you might +be able to skimp on things saved/restored (usually just registers used to pass +return values). + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void return_to_handler(void) +{ + /* save all state needed by the ABI (see paragraph above) */ + + void (*original_return_point)(void) = ftrace_return_to_handler(); + + /* restore all state needed by the ABI */ + + /* this is usually either a return or a jump */ + original_return_point(); +} +#endif + + +HAVE_FTRACE_NMI_ENTER +--------------------- + +If you can't trace NMI functions, then skip this option. + +
+ + +HAVE_FTRACE_SYSCALLS +--------------------- + +
+ + +HAVE_FTRACE_MCOUNT_RECORD +------------------------- + +See scripts/recordmcount.pl for more info. + +
+ + +HAVE_DYNAMIC_FTRACE +--------------------- + +
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 355d0f1f8c501ebf244f71fd45a777e7ee26649c..1b6292bbdd6dac91655057367681e1c733bf2a73 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -26,6 +26,12 @@ disabled, and more (ftrace allows for tracer plugins, which means that the list of tracers can always grow). +Implementation Details +---------------------- + +See ftrace-design.txt for details for arch porters and such. + + The File System --------------- diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bcd9c07848bef0c4fa9eb8042add0ad59faeb374..3a46b7b7abb219c40bf39ce4d5f4e448da131212 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -48,13 +48,13 @@ #define KPROBE_HIT_SSDONE 0x00000008 /* Attach to insert probes on any functions which should be ignored*/ -#define __kprobes __attribute__((__section__(".kprobes.text"))) notrace +#define __kprobes __attribute__((__section__(".kprobes.text"))) #else /* CONFIG_KPROBES */ typedef int kprobe_opcode_t; struct arch_specific_insn { int dummy; }; -#define __kprobes notrace +#define __kprobes #endif /* CONFIG_KPROBES */ struct kprobe; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 308bafd93325c992bb43bab91726aacefd91db3c..72a3b437b8299eef59a0a30cfca73ab84d431d61 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -239,9 +239,9 @@ ftrace_format_##call(struct ftrace_event_call *unused, \ #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ - static const struct trace_print_flags flags[] = \ + static const struct trace_print_flags __flags[] = \ { flag_array, { -1, NULL }}; \ - ftrace_print_flags_seq(p, delim, flag, flags); \ + ftrace_print_flags_seq(p, delim, flag, __flags); \ }) #undef __print_symbolic @@ -254,7 +254,7 @@ ftrace_format_##call(struct ftrace_event_call *unused, \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ +static enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ struct trace_seq *s = &iter->seq; \ @@ -317,7 +317,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ +static int \ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1ea0d1234f4a36cb2b0fcb6769ddef83def4d35e..e71634604400c6dd5bb9f7e0a019143ae9c7cc32 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-implementation.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_HW_BRANCH_TRACER bool config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-implementation.txt config TRACER_MAX_TRACE bool @@ -469,6 +479,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b23d56700882a265d0941fb65b64b49863e24e2..f7ab7fc162ccbbc091a64541e2e82761c528e1d2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2062,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd81f7f996792852e5035278736cdf9bec46..20c5f92e28a8864cc3ca188cc1205200a89b6c55 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + __raw_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + __raw_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c866d34e0144cc0f04a24214e54213b83baf27fe..a431748ddd6e34ee91f990bc525c2893e80a1841 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -78,7 +78,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth) ); /* Function return entry */ @@ -97,8 +97,8 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, - __entry->calltime, __entry->rettim, - __entrty->depth) + __entry->calltime, __entry->rettime, + __entry->depth) ); /* @@ -116,15 +116,6 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, __field( unsigned char, next_state ) \ __field( unsigned int, next_cpu ) -#if 0 -FTRACE_ENTRY_STRUCT_ONLY(ctx_switch_entry, - - F_STRUCT( - FTRACE_CTX_FIELDS - ) -); -#endif - FTRACE_ENTRY(context_switch, ctx_switch_entry, TRACE_CTX, @@ -133,7 +124,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, FTRACE_CTX_FIELDS ), - F_printk(b"%u:%u:%u ==> %u:%u:%u [%03u]", + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, __entry->next_cpu @@ -257,8 +248,8 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, __field_desc( unsigned char, rw, width ) ), - F_printk("%lx %lx %lx %d %lx %lx", - __entry->phs, __entry->value, __entry->pc, + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, __entry->map_id, __entry->opcode, __entry->width) ); @@ -275,8 +266,8 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __field_desc( unsigned char, map, opcode ) ), - F_printk("%lx %lx %lx %d %lx", - __entry->phs, __entry->virt, __entry->len, + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, __entry->map_id, __entry->opcode) ); @@ -370,7 +361,7 @@ FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, __field( int, node ) ), - F_printk("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" " flags:%x node:%d", __entry->type_id, __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index adbed124c3e7eb3099232b30281b6ad6553e40d1..787f0fb0994e861a3248a5018f7c4667b200910b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1154,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1326,6 +1326,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4cb29d84d73a3c33325f908865c09ad00fc94e1f..9753fcc61bc55b4577527f1ef8ef4853cba7406f 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -22,6 +22,47 @@ #undef __field_struct #define __field_struct(type, item) +#undef __field +#define __field(type, item) type item; + +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force cmpile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ @@ -88,10 +129,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; \ } -#undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) - #include "trace_entries.h" @@ -172,32 +209,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef __dynamic_array #define __dynamic_array(type, item) - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; - #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ static int ftrace_raw_init_event_##call(void); \