提交 f7d18ef6 编写于 作者: D David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

The following patchset contains Netfilter fixes for net, specifically
fixes for the nf_conncount infrastructure which is causing troubles
since 5c789e13 ("netfilter: nf_conncount: Add list lock and gc
worker, and RCU for init tree search"). Patches aim to simplify this
infrastructure while fixing up the problems:

1) Use fixed size CONNCOUNT_SLOTS in nf_conncount, from Shawn Bohrer.

2) Incorrect signedness in age calculation from find_or_evict(),
   from Florian Westphal.

3) Proper locking for the garbage collector workqueue callback,
   first make a patch to count how many nodes can be collected
   without holding locks, then grab lock and release them. Also
   from Florian.

4) Restart node lookup from the insertion path, after releasing nodes
   via packet path garbage collection. Shawn Bohrer described a scenario
   that may result in inserting a connection in an already dead list
   node. Patch from Florian.

5) Merge lookup and add function to avoid a hold release and re-grab.
   From Florian.

6) Be safe and iterate over the node lists under the spinlock.

7) Speculative list nodes removal via garbage collection, check if
   list node got a connection while it was scheduled for deletion
   via gc.

8) Accidental argument swap in find_next_bit() that leads to more
   frequent scheduling of the workqueue. From Florian Westphal.
====================
Signed-off-by: NDavid S. Miller <davem@davemloft.net>
......@@ -5,17 +5,10 @@
struct nf_conncount_data;
enum nf_conncount_list_add {
NF_CONNCOUNT_ADDED, /* list add was ok */
NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */
NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */
};
struct nf_conncount_list {
spinlock_t list_lock;
struct list_head head; /* connections with the same filtering key */
unsigned int count; /* length of list */
bool dead;
};
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
......@@ -29,18 +22,12 @@ unsigned int nf_conncount_count(struct net *net,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone);
void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone,
bool *addit);
int nf_conncount_add(struct net *net, struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone);
void nf_conncount_list_init(struct nf_conncount_list *list);
enum nf_conncount_list_add
nf_conncount_add(struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone);
bool nf_conncount_gc_list(struct net *net,
struct nf_conncount_list *list);
......
......@@ -33,12 +33,6 @@
#define CONNCOUNT_SLOTS 256U
#ifdef CONFIG_LOCKDEP
#define CONNCOUNT_LOCK_SLOTS 8U
#else
#define CONNCOUNT_LOCK_SLOTS 256U
#endif
#define CONNCOUNT_GC_MAX_NODES 8
#define MAX_KEYLEN 5
......@@ -49,8 +43,6 @@ struct nf_conncount_tuple {
struct nf_conntrack_zone zone;
int cpu;
u32 jiffies32;
bool dead;
struct rcu_head rcu_head;
};
struct nf_conncount_rb {
......@@ -60,7 +52,7 @@ struct nf_conncount_rb {
struct rcu_head rcu_head;
};
static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;
struct nf_conncount_data {
unsigned int keylen;
......@@ -89,79 +81,25 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
return memcmp(a, b, klen * sizeof(u32));
}
enum nf_conncount_list_add
nf_conncount_add(struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone)
{
struct nf_conncount_tuple *conn;
if (WARN_ON_ONCE(list->count > INT_MAX))
return NF_CONNCOUNT_ERR;
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL)
return NF_CONNCOUNT_ERR;
conn->tuple = *tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
conn->dead = false;
spin_lock_bh(&list->list_lock);
if (list->dead == true) {
kmem_cache_free(conncount_conn_cachep, conn);
spin_unlock_bh(&list->list_lock);
return NF_CONNCOUNT_SKIP;
}
list_add_tail(&conn->node, &list->head);
list->count++;
spin_unlock_bh(&list->list_lock);
return NF_CONNCOUNT_ADDED;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);
static void __conn_free(struct rcu_head *h)
{
struct nf_conncount_tuple *conn;
conn = container_of(h, struct nf_conncount_tuple, rcu_head);
kmem_cache_free(conncount_conn_cachep, conn);
}
static bool conn_free(struct nf_conncount_list *list,
static void conn_free(struct nf_conncount_list *list,
struct nf_conncount_tuple *conn)
{
bool free_entry = false;
spin_lock_bh(&list->list_lock);
if (conn->dead) {
spin_unlock_bh(&list->list_lock);
return free_entry;
}
lockdep_assert_held(&list->list_lock);
list->count--;
conn->dead = true;
list_del_rcu(&conn->node);
if (list->count == 0) {
list->dead = true;
free_entry = true;
}
list_del(&conn->node);
spin_unlock_bh(&list->list_lock);
call_rcu(&conn->rcu_head, __conn_free);
return free_entry;
kmem_cache_free(conncount_conn_cachep, conn);
}
static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,
struct nf_conncount_tuple *conn, bool *free_entry)
struct nf_conncount_tuple *conn)
{
const struct nf_conntrack_tuple_hash *found;
unsigned long a, b;
int cpu = raw_smp_processor_id();
__s32 age;
u32 age;
found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
if (found)
......@@ -176,52 +114,45 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
*/
age = a - b;
if (conn->cpu == cpu || age >= 2) {
*free_entry = conn_free(list, conn);
conn_free(list, conn);
return ERR_PTR(-ENOENT);
}
return ERR_PTR(-EAGAIN);
}
void nf_conncount_lookup(struct net *net,
struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone,
bool *addit)
static int __nf_conncount_add(struct net *net,
struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
struct nf_conn *found_ct;
unsigned int collect = 0;
bool free_entry = false;
/* best effort only */
*addit = tuple ? true : false;
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
if (collect > CONNCOUNT_GC_MAX_NODES)
break;
found = find_or_evict(net, list, conn, &free_entry);
found = find_or_evict(net, list, conn);
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
if (!tuple)
continue;
if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
*addit = false;
} else if (PTR_ERR(found) == -ENOENT)
return 0; /* already exists */
} else {
collect++;
}
continue;
}
found_ct = nf_ct_tuplehash_to_ctrack(found);
if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
* We should not see tuples twice unless someone hooks
......@@ -229,7 +160,8 @@ void nf_conncount_lookup(struct net *net,
*
* Attempt to avoid a re-add in this case.
*/
*addit = false;
nf_ct_put(found_ct);
return 0;
} else if (already_closed(found_ct)) {
/*
* we do not care about connections which are
......@@ -243,19 +175,48 @@ void nf_conncount_lookup(struct net *net,
nf_ct_put(found_ct);
}
if (WARN_ON_ONCE(list->count > INT_MAX))
return -EOVERFLOW;
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL)
return -ENOMEM;
conn->tuple = *tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
return 0;
}
EXPORT_SYMBOL_GPL(nf_conncount_lookup);
int nf_conncount_add(struct net *net,
struct nf_conncount_list *list,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone)
{
int ret;
/* check the saved connections */
spin_lock_bh(&list->list_lock);
ret = __nf_conncount_add(net, list, tuple, zone);
spin_unlock_bh(&list->list_lock);
return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);
void nf_conncount_list_init(struct nf_conncount_list *list)
{
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
list->dead = false;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
/* Return true if the list is empty */
/* Return true if the list is empty. Must be called with BH disabled. */
bool nf_conncount_gc_list(struct net *net,
struct nf_conncount_list *list)
{
......@@ -263,17 +224,17 @@ bool nf_conncount_gc_list(struct net *net,
struct nf_conncount_tuple *conn, *conn_n;
struct nf_conn *found_ct;
unsigned int collected = 0;
bool free_entry = false;
bool ret = false;
/* don't bother if other cpu is already doing GC */
if (!spin_trylock(&list->list_lock))
return false;
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
found = find_or_evict(net, list, conn, &free_entry);
found = find_or_evict(net, list, conn);
if (IS_ERR(found)) {
if (PTR_ERR(found) == -ENOENT) {
if (free_entry)
return true;
if (PTR_ERR(found) == -ENOENT)
collected++;
}
continue;
}
......@@ -284,23 +245,19 @@ bool nf_conncount_gc_list(struct net *net,
* closed already -> ditch it
*/
nf_ct_put(found_ct);
if (conn_free(list, conn))
return true;
conn_free(list, conn);
collected++;
continue;
}
nf_ct_put(found_ct);
if (collected > CONNCOUNT_GC_MAX_NODES)
return false;
break;
}
spin_lock_bh(&list->list_lock);
if (!list->count) {
list->dead = true;
if (!list->count)
ret = true;
}
spin_unlock_bh(&list->list_lock);
spin_unlock(&list->list_lock);
return ret;
}
......@@ -314,6 +271,7 @@ static void __tree_nodes_free(struct rcu_head *h)
kmem_cache_free(conncount_rb_cachep, rbconn);
}
/* caller must hold tree nf_conncount_locks[] lock */
static void tree_nodes_free(struct rb_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
......@@ -323,8 +281,10 @@ static void tree_nodes_free(struct rb_root *root,
while (gc_count) {
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
rb_erase(&rbconn->node, root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
if (!rbconn->list.count) {
rb_erase(&rbconn->node, root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
}
spin_unlock(&rbconn->list.list_lock);
}
}
......@@ -341,20 +301,19 @@ insert_tree(struct net *net,
struct rb_root *root,
unsigned int hash,
const u32 *key,
u8 keylen,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone)
{
enum nf_conncount_list_add ret;
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
struct rb_node **rbnode, *parent;
struct nf_conncount_rb *rbconn;
struct nf_conncount_tuple *conn;
unsigned int count = 0, gc_count = 0;
bool node_found = false;
spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
u8 keylen = data->keylen;
bool do_gc = true;
spin_lock_bh(&nf_conncount_locks[hash]);
restart:
parent = NULL;
rbnode = &(root->rb_node);
while (*rbnode) {
......@@ -368,45 +327,32 @@ insert_tree(struct net *net,
} else if (diff > 0) {
rbnode = &((*rbnode)->rb_right);
} else {
/* unlikely: other cpu added node already */
node_found = true;
ret = nf_conncount_add(&rbconn->list, tuple, zone);
if (ret == NF_CONNCOUNT_ERR) {
int ret;
ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
if (ret)
count = 0; /* hotdrop */
} else if (ret == NF_CONNCOUNT_ADDED) {
else
count = rbconn->list.count;
} else {
/* NF_CONNCOUNT_SKIP, rbconn is already
* reclaimed by gc, insert a new tree node
*/
node_found = false;
}
break;
tree_nodes_free(root, gc_nodes, gc_count);
goto out_unlock;
}
if (gc_count >= ARRAY_SIZE(gc_nodes))
continue;
if (nf_conncount_gc_list(net, &rbconn->list))
if (do_gc && nf_conncount_gc_list(net, &rbconn->list))
gc_nodes[gc_count++] = rbconn;
}
if (gc_count) {
tree_nodes_free(root, gc_nodes, gc_count);
/* tree_node_free before new allocation permits
* allocator to re-use newly free'd object.
*
* This is a rare event; in most cases we will find
* existing node to re-use. (or gc_count is 0).
*/
if (gc_count >= ARRAY_SIZE(gc_nodes))
schedule_gc_worker(data, hash);
schedule_gc_worker(data, hash);
gc_count = 0;
do_gc = false;
goto restart;
}
if (node_found)
goto out_unlock;
/* expected case: match, insert new node */
rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
if (rbconn == NULL)
......@@ -430,7 +376,7 @@ insert_tree(struct net *net,
rb_link_node_rcu(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, root);
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
spin_unlock_bh(&nf_conncount_locks[hash]);
return count;
}
......@@ -441,7 +387,6 @@ count_tree(struct net *net,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_zone *zone)
{
enum nf_conncount_list_add ret;
struct rb_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
......@@ -454,7 +399,6 @@ count_tree(struct net *net,
parent = rcu_dereference_raw(root->rb_node);
while (parent) {
int diff;
bool addit;
rbconn = rb_entry(parent, struct nf_conncount_rb, node);
......@@ -464,31 +408,36 @@ count_tree(struct net *net,
} else if (diff > 0) {
parent = rcu_dereference_raw(parent->rb_right);
} else {
/* same source network -> be counted! */
nf_conncount_lookup(net, &rbconn->list, tuple, zone,
&addit);
int ret;
if (!addit)
if (!tuple) {
nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
}
ret = nf_conncount_add(&rbconn->list, tuple, zone);
if (ret == NF_CONNCOUNT_ERR) {
return 0; /* hotdrop */
} else if (ret == NF_CONNCOUNT_ADDED) {
return rbconn->list.count;
} else {
/* NF_CONNCOUNT_SKIP, rbconn is already
* reclaimed by gc, insert a new tree node
*/
spin_lock_bh(&rbconn->list.list_lock);
/* Node might be about to be free'd.
* We need to defer to insert_tree() in this case.
*/
if (rbconn->list.count == 0) {
spin_unlock_bh(&rbconn->list.list_lock);
break;
}
/* same source network -> be counted! */
ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
spin_unlock_bh(&rbconn->list.list_lock);
if (ret)
return 0; /* hotdrop */
else
return rbconn->list.count;
}
}
if (!tuple)
return 0;
return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
return insert_tree(net, data, root, hash, key, tuple, zone);
}
static void tree_gc_worker(struct work_struct *work)
......@@ -499,27 +448,47 @@ static void tree_gc_worker(struct work_struct *work)
struct rb_node *node;
unsigned int tree, next_tree, gc_count = 0;
tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
tree = data->gc_tree % CONNCOUNT_SLOTS;
root = &data->root[tree];
local_bh_disable();
rcu_read_lock();
for (node = rb_first(root); node != NULL; node = rb_next(node)) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
if (nf_conncount_gc_list(data->net, &rbconn->list))
gc_nodes[gc_count++] = rbconn;
gc_count++;
}
rcu_read_unlock();
local_bh_enable();
cond_resched();
spin_lock_bh(&nf_conncount_locks[tree]);
if (gc_count < ARRAY_SIZE(gc_nodes))
goto next; /* do not bother */
if (gc_count) {
tree_nodes_free(root, gc_nodes, gc_count);
gc_count = 0;
node = rb_first(root);
while (node != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
if (rbconn->list.count > 0)
continue;
gc_nodes[gc_count++] = rbconn;
if (gc_count >= ARRAY_SIZE(gc_nodes)) {
tree_nodes_free(root, gc_nodes, gc_count);
gc_count = 0;
}
}
tree_nodes_free(root, gc_nodes, gc_count);
next:
clear_bit(tree, data->pending_trees);
next_tree = (tree + 1) % CONNCOUNT_SLOTS;
next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);
if (next_tree < CONNCOUNT_SLOTS) {
data->gc_tree = next_tree;
......@@ -621,10 +590,7 @@ static int __init nf_conncount_modinit(void)
{
int i;
BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS);
BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0);
for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i)
for (i = 0; i < CONNCOUNT_SLOTS; ++i)
spin_lock_init(&nf_conncount_locks[i]);
conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
......
......@@ -5727,6 +5727,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
if (!nest)
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
goto nla_put_failure;
......
......@@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
unsigned int count;
bool addit;
tuple_ptr = &tuple;
......@@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
return;
}
nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
&addit);
count = priv->list.count;
if (!addit)
goto out;
if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) {
regs->verdict.code = NF_DROP;
return;
}
count++;
out:
count = priv->list.count;
if ((count > priv->limit) ^ priv->invert) {
regs->verdict.code = NFT_BREAK;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
新手
引导
客服 返回
顶部