diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 47e3997f7b5cf39233283ff43d84937daa502c2f..1a99e7939c2b7274cc6d8efec7f3e93d1035f232 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -208,7 +208,6 @@ struct ftrace_event_call { #define PERF_MAX_TRACE_SIZE 2048 -#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ extern void destroy_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3e379edbd8f39770bd9345794e266..856e73cc1d3f7010b67eff60b356cf075d1f8c2f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,8 +661,10 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; - struct filter_pred **preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -674,11 +676,23 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 + struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); @@ -700,11 +714,23 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f6d53cdc89b45841cff2679551c56..3249b4f77ef083fdec2a5bba0a52c3f17a482b23 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,9 +123,13 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; + struct filter_pred *preds; struct filter_pred *pred; - int i; + struct filter_pred *root; + int n_preds; + int done = 0; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + pred = root; - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); } -static void filter_disable_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; - call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_call *call) { - int i; + call->flags &= ~TRACE_EVENT_FL_FILTERED; +} +static void __free_filter(struct event_filter *filter) +{ if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ struct filter_pred *pred; int i; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); + if (filter->preds) + __free_preds(filter); - filter->n_preds = 0; + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!filter->preds) - goto oom; + return -ENOMEM; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } - return filter; - -oom: - __free_preds(filter); - return ERR_PTR(-ENOMEM); -} - -static int init_preds(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - return 0; } -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; - fn = filter_pred_and; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; - fn = filter_pred_or; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_pred_stack(&stack, n_preds); + if (err) + return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) + goto fail; + } + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter; + + if (strcmp(call->class->system, system->name) != 0) continue; - /* really apply the filter */ - filter_disable_preds(call); - err = replace_preds(call, filter, ps, filter_string, false); + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; + + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); if (err) - filter_disable_preds(call); - else { + goto fail_mem; + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); } return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_preds(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - remove_filter_string(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_filter(filter); goto out_unlock; } @@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1334,18 +1880,21 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); - if (IS_ERR(filter)) { + filter = __alloc_filter(); + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } @@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1435,9 +1994,9 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex);