Commit 7f81ff04 authored by David S. Miller

Merge branch 'xfrm-remove-flow-cache'

Florian Westphal says:

====================
xfrm: remove flow cache

After the RCU-ification of the ipsec packet path there are no major
scalability issues anymore without the flow cache.

We still incur a performance hit, which comes mostly from the extra xfrm
dst allocation/freeing.
The last patch in the series adds a simple percpu cache to avoid the
extra allocation if a packet matches the same policies as the last one.

The main concern with this is that we will see performance drops,
especially with large numbers of policies/SAs.

However, during hallway discussions at nfws 2017 it seemed the issues
with flow caching outweigh the removal downsides, and that it
might be best to just 'remove it' and see where the practical issues
(if any) will appear.

It should now be possible to also remove the genid member in the policies
as we don't hold bundles for a prolonged time anymore, but I think
this change is controversial (and intrusive) enough as-is, so defer
that to a later point in time.

Changes since last rfc:

- fix build failures due to implicit interrupt.h includes
- rework last patch (pcpu cache):
 * avoid xchg()
 * check policies for walk.dead = 1 instead of the more costly bundle_ok().
 * flush pcpu bundles when sa/policies get removed, to allow module
   references to go away (suggested by Ilan Tayari)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
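
For illustration only, the following is a minimal userspace sketch of the per-cpu
"last bundle" cache idea from the last patch in the series. It is not the kernel
code: the names (struct bundle, pcpu_last_bundle, bundle_reusable, lookup_or_build)
and the simplified reference counting are assumptions made for this sketch, and it
ignores the serialization the kernel gets from running with BHs disabled on the
owning CPU.

/*
 * Sketch of a per-cpu "last bundle" cache: reuse the previously built
 * bundle when the new lookup resolves to the same, still-alive policies,
 * otherwise build a new bundle and cache it.  Hypothetical names, not
 * the kernel implementation.
 */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define MAX_POLS 2
#define NR_CPUS_SKETCH 64

struct policy {
        bool dead;                      /* stands in for policy->walk.dead */
};

struct bundle {
        int refcnt;
        int num_pols;
        const struct policy *pols[MAX_POLS];
};

/* one cached bundle per cpu; the kernel uses this_cpu_read()/write() */
static struct bundle *pcpu_last_bundle[NR_CPUS_SKETCH];

static void bundle_put(struct bundle *b)
{
        if (b && --b->refcnt == 0)
                free(b);
}

static bool bundle_reusable(const struct bundle *b,
                            const struct policy *const *pols, int num_pols)
{
        int i;

        if (!b || b->num_pols != num_pols)
                return false;
        for (i = 0; i < num_pols; i++)
                if (b->pols[i]->dead)   /* cheap walk.dead-style check */
                        return false;
        /* same policy pointers in the same order */
        return memcmp(b->pols, pols, sizeof(*pols) * num_pols) == 0;
}

static struct bundle *lookup_or_build(int cpu,
                                      const struct policy *const *pols,
                                      int num_pols)
{
        struct bundle *old = pcpu_last_bundle[cpu], *b;

        if (bundle_reusable(old, pols, num_pols)) {
                old->refcnt++;          /* fast path: no allocation */
                return old;
        }

        b = calloc(1, sizeof(*b));      /* slow path: build a new bundle */
        if (!b)
                return NULL;
        b->refcnt = 2;                  /* one ref for the cache, one for the caller */
        b->num_pols = num_pols;
        memcpy(b->pols, pols, sizeof(*pols) * num_pols);

        pcpu_last_bundle[cpu] = b;      /* cache the new bundle ... */
        bundle_put(old);                /* ... and drop the old one */
        return b;
}

int main(void)
{
        struct policy p = { .dead = false };
        const struct policy *pols[1] = { &p };
        struct bundle *a = lookup_or_build(0, pols, 1);
        struct bundle *b = lookup_or_build(0, pols, 1); /* reuses 'a' */

        bundle_put(a);
        bundle_put(b);
        return 0;
}

In the real patch below, the per-cpu slots are additionally flushed when states or
policies are removed (xfrm_policy_cache_flush) so that module references can go away.
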
......@@ -1291,8 +1291,7 @@ tag - INTEGER
xfrm4_gc_thresh - INTEGER
The threshold at which we will start garbage collecting for IPv4
destination cache entries. At twice this value the system will
refuse new allocations. The value must be set below the flowcache
limit (4096 * number of online cpus) to take effect.
refuse new allocations.
igmp_link_local_mcast_reports - BOOLEAN
Enable IGMP reports for link local multicast groups in the
......@@ -1778,8 +1777,7 @@ ratelimit - INTEGER
xfrm6_gc_thresh - INTEGER
The threshold at which we will start garbage collecting for IPv6
destination cache entries. At twice this value the system will
refuse new allocations. The value must be set below the flowcache
limit (4096 * number of online cpus) to take effect.
refuse new allocations.
IPv6 Update by:
......
......@@ -20,7 +20,7 @@
#include <linux/if_arcnet.h>
#ifdef __KERNEL__
#include <linux/irqreturn.h>
#include <linux/interrupt.h>
/*
* RECON_THRESHOLD is the maximum number of RECON messages to receive
......
......@@ -129,6 +129,7 @@
#include <net/dcbnl.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#define XGBE_DRV_NAME "amd-xgbe"
#define XGBE_DRV_VERSION "1.0.3"
......
......@@ -17,6 +17,7 @@
#include <linux/netdevice.h>
#include <linux/tcp.h>
#include <linux/interrupt.h>
#include "dwc-xlgmac.h"
#include "dwc-xlgmac-reg.h"
......
......@@ -66,6 +66,7 @@
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <net/ieee802154_netdev.h>
#include <net/mac802154.h>
......
......@@ -218,40 +218,6 @@ static inline unsigned int flow_key_size(u16 family)
return 0;
}
#define FLOW_DIR_IN 0
#define FLOW_DIR_OUT 1
#define FLOW_DIR_FWD 2
struct net;
struct sock;
struct flow_cache_ops;
struct flow_cache_object {
const struct flow_cache_ops *ops;
};
struct flow_cache_ops {
struct flow_cache_object *(*get)(struct flow_cache_object *);
int (*check)(struct flow_cache_object *);
void (*delete)(struct flow_cache_object *);
};
typedef struct flow_cache_object *(*flow_resolve_t)(
struct net *net, const struct flowi *key, u16 family,
u8 dir, struct flow_cache_object *oldobj, void *ctx);
struct flow_cache_object *flow_cache_lookup(struct net *net,
const struct flowi *key, u16 family,
u8 dir, flow_resolve_t resolver,
void *ctx);
int flow_cache_init(struct net *net);
void flow_cache_fini(struct net *net);
void flow_cache_hp_init(void);
void flow_cache_flush(struct net *net);
void flow_cache_flush_deferred(struct net *net);
extern atomic_t flow_cache_genid;
__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6)
......
#ifndef _NET_FLOWCACHE_H
#define _NET_FLOWCACHE_H
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/notifier.h>
struct flow_cache_percpu {
struct hlist_head *hash_table;
unsigned int hash_count;
u32 hash_rnd;
int hash_rnd_recalc;
struct tasklet_struct flush_tasklet;
};
struct flow_cache {
u32 hash_shift;
struct flow_cache_percpu __percpu *percpu;
struct hlist_node node;
unsigned int low_watermark;
unsigned int high_watermark;
struct timer_list rnd_timer;
};
#endif /* _NET_FLOWCACHE_H */
......@@ -6,7 +6,6 @@
#include <linux/workqueue.h>
#include <linux/xfrm.h>
#include <net/dst_ops.h>
#include <net/flowcache.h>
struct ctl_table_header;
......@@ -73,16 +72,6 @@ struct netns_xfrm {
spinlock_t xfrm_state_lock;
spinlock_t xfrm_policy_lock;
struct mutex xfrm_cfg_mutex;
/* flow cache part */
struct flow_cache flow_cache_global;
atomic_t flow_cache_genid;
struct list_head flow_cache_gc_list;
atomic_t flow_cache_gc_count;
spinlock_t flow_cache_gc_lock;
struct work_struct flow_cache_gc_work;
struct work_struct flow_cache_flush_work;
struct mutex flow_flush_sem;
};
#endif
......@@ -317,6 +317,7 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
void km_policy_notify(struct xfrm_policy *xp, int dir,
const struct km_event *c);
void xfrm_policy_cache_flush(void);
void km_state_notify(struct xfrm_state *x, const struct km_event *c);
struct xfrm_tmpl;
......@@ -563,7 +564,6 @@ struct xfrm_policy {
refcount_t refcnt;
struct timer_list timer;
struct flow_cache_object flo;
atomic_t genid;
u32 priority;
u32 index;
......@@ -978,7 +978,6 @@ struct xfrm_dst {
struct rt6_info rt6;
} u;
struct dst_entry *route;
struct flow_cache_object flo;
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
int num_pols, num_xfrms;
u32 xfrm_genid;
......@@ -1226,9 +1225,6 @@ static inline void xfrm_sk_free_policy(struct sock *sk)
}
}
void xfrm_garbage_collect(struct net *net);
void xfrm_garbage_collect_deferred(struct net *net);
#else
static inline void xfrm_sk_free_policy(struct sock *sk) {}
......@@ -1263,9 +1259,6 @@ static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
{
return 1;
}
static inline void xfrm_garbage_collect(struct net *net)
{
}
#endif
static __inline__
......
......@@ -11,7 +11,6 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
......
/* flow.c: Generic flow cache.
*
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>
struct flow_cache_entry {
union {
struct hlist_node hlist;
struct list_head gc_list;
} u;
struct net *net;
u16 family;
u8 dir;
u32 genid;
struct flowi key;
struct flow_cache_object *object;
};
struct flow_flush_info {
struct flow_cache *cache;
atomic_t cpuleft;
struct completion completion;
};
static struct kmem_cache *flow_cachep __read_mostly;
#define flow_cache_hash_size(cache) (1U << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
static void flow_cache_new_hashrnd(unsigned long arg)
{
struct flow_cache *fc = (void *) arg;
int i;
for_each_possible_cpu(i)
per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&fc->rnd_timer);
}
static int flow_entry_valid(struct flow_cache_entry *fle,
struct netns_xfrm *xfrm)
{
if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
return 0;
if (fle->object && !fle->object->ops->check(fle->object))
return 0;
return 1;
}
static void flow_entry_kill(struct flow_cache_entry *fle,
struct netns_xfrm *xfrm)
{
if (fle->object)
fle->object->ops->delete(fle->object);
kmem_cache_free(flow_cachep, fle);
}
static void flow_cache_gc_task(struct work_struct *work)
{
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
flow_cache_gc_work);
INIT_LIST_HEAD(&gc_list);
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
atomic_dec(&xfrm->flow_cache_gc_count);
}
}
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
unsigned int deleted,
struct list_head *gc_list,
struct netns_xfrm *xfrm)
{
if (deleted) {
atomic_add(deleted, &xfrm->flow_cache_gc_count);
fcp->hash_count -= deleted;
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
schedule_work(&xfrm->flow_cache_gc_work);
}
}
static void __flow_cache_shrink(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
unsigned int shrink_to)
{
struct flow_cache_entry *fle;
struct hlist_node *tmp;
LIST_HEAD(gc_list);
unsigned int deleted = 0;
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
flow_cache_global);
unsigned int i;
for (i = 0; i < flow_cache_hash_size(fc); i++) {
unsigned int saved = 0;
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (saved < shrink_to &&
flow_entry_valid(fle, xfrm)) {
saved++;
} else {
deleted++;
hlist_del(&fle->u.hlist);
list_add_tail(&fle->u.gc_list, &gc_list);
}
}
}
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
static void flow_cache_shrink(struct flow_cache *fc,
struct flow_cache_percpu *fcp)
{
unsigned int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
__flow_cache_shrink(fc, fcp, shrink_to);
}
static void flow_new_hash_rnd(struct flow_cache *fc,
struct flow_cache_percpu *fcp)
{
get_random_bytes(&fcp->hash_rnd, sizeof(u32));
fcp->hash_rnd_recalc = 0;
__flow_cache_shrink(fc, fcp, 0);
}
static u32 flow_hash_code(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
const struct flowi *key,
unsigned int keysize)
{
const u32 *k = (const u32 *) key;
const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
return jhash2(k, length, fcp->hash_rnd)
& (flow_cache_hash_size(fc) - 1);
}
/* I hear what you're saying, use memcmp. But memcmp cannot make
* important assumptions that we can here, such as alignment.
*/
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
unsigned int keysize)
{
const flow_compare_t *k1, *k1_lim, *k2;
k1 = (const flow_compare_t *) key1;
k1_lim = k1 + keysize;
k2 = (const flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
return 1;
} while (k1 < k1_lim);
return 0;
}
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
struct flow_cache *fc = &net->xfrm.flow_cache_global;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle, *tfle;
struct flow_cache_object *flo;
unsigned int keysize;
unsigned int hash;
local_bh_disable();
fcp = this_cpu_ptr(fc->percpu);
fle = NULL;
flo = NULL;
keysize = flow_key_size(family);
if (!keysize)
goto nocache;
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
if (!fcp->hash_table)
goto nocache;
if (fcp->hash_rnd_recalc)
flow_new_hash_rnd(fc, fcp);
hash = flow_hash_code(fc, fcp, key, keysize);
hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
if (tfle->net == net &&
tfle->family == family &&
tfle->dir == dir &&
flow_key_compare(key, &tfle->key, keysize) == 0) {
fle = tfle;
break;
}
}
if (unlikely(!fle)) {
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
if (atomic_read(&net->xfrm.flow_cache_gc_count) >
2 * num_online_cpus() * fc->high_watermark) {
flo = ERR_PTR(-ENOBUFS);
goto ret_object;
}
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
fle->net = net;
fle->family = family;
fle->dir = dir;
memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
fle->object = NULL;
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
fcp->hash_count++;
}
} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
flo = fle->object;
if (!flo)
goto ret_object;
flo = flo->ops->get(flo);
if (flo)
goto ret_object;
} else if (fle->object) {
flo = fle->object;
flo->ops->delete(flo);
fle->object = NULL;
}
nocache:
flo = NULL;
if (fle) {
flo = fle->object;
fle->object = NULL;
}
flo = resolver(net, key, family, dir, flo, ctx);
if (fle) {
fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
if (!IS_ERR(flo))
fle->object = flo;
else
fle->genid--;
} else {
if (!IS_ERR_OR_NULL(flo))
flo->ops->delete(flo);
}
ret_object:
local_bh_enable();
return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);
static void flow_cache_flush_tasklet(unsigned long data)
{
struct flow_flush_info *info = (void *)data;
struct flow_cache *fc = info->cache;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle;
struct hlist_node *tmp;
LIST_HEAD(gc_list);
unsigned int deleted = 0;
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
flow_cache_global);
unsigned int i;
fcp = this_cpu_ptr(fc->percpu);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (flow_entry_valid(fle, xfrm))
continue;
deleted++;
hlist_del(&fle->u.hlist);
list_add_tail(&fle->u.gc_list, &gc_list);
}
}
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
}
/*
* Return whether a cpu needs flushing. Conservatively, we assume
* the presence of any entries means the core may require flushing,
* since the flow_cache_ops.check() function may assume it's running
* on the same core as the per-cpu cache component.
*/
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp;
unsigned int i;
fcp = per_cpu_ptr(fc->percpu, cpu);
for (i = 0; i < flow_cache_hash_size(fc); i++)
if (!hlist_empty(&fcp->hash_table[i]))
return 0;
return 1;
}
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
struct tasklet_struct *tasklet;
tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
cpumask_var_t mask;
int i, self;
/* Track which cpus need flushing to avoid disturbing all cores. */
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return;
cpumask_clear(mask);
/* Don't want cpus going down or up during this. */
get_online_cpus();
mutex_lock(&net->xfrm.flow_flush_sem);
info.cache = &net->xfrm.flow_cache_global;
for_each_online_cpu(i)
if (!flow_cache_percpu_empty(info.cache, i))
cpumask_set_cpu(i, mask);
atomic_set(&info.cpuleft, cpumask_weight(mask));
if (atomic_read(&info.cpuleft) == 0)
goto done;
init_completion(&info.completion);
local_bh_disable();
self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
if (self)
flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
done:
mutex_unlock(&net->xfrm.flow_flush_sem);
put_online_cpus();
free_cpumask_var(mask);
}
static void flow_cache_flush_task(struct work_struct *work)
{
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
flow_cache_flush_work);
struct net *net = container_of(xfrm, struct net, xfrm);
flow_cache_flush(net);
}
void flow_cache_flush_deferred(struct net *net)
{
schedule_work(&net->xfrm.flow_cache_flush_work);
}
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
unsigned int sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
if (!fcp->hash_table) {
fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
if (!fcp->hash_table) {
pr_err("NET: failed to allocate flow cache sz %u\n", sz);
return -ENOMEM;
}
fcp->hash_rnd_recalc = 1;
fcp->hash_count = 0;
tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
}
return 0;
}
static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
return flow_cache_cpu_prepare(fc, cpu);
}
static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
__flow_cache_shrink(fc, fcp, 0);
return 0;
}
int flow_cache_init(struct net *net)
{
int i;
struct flow_cache *fc = &net->xfrm.flow_cache_global;
if (!flow_cachep)
flow_cachep = kmem_cache_create("flow_cache",
sizeof(struct flow_cache_entry),
0, SLAB_PANIC, NULL);
spin_lock_init(&net->xfrm.flow_cache_gc_lock);
INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
mutex_init(&net->xfrm.flow_flush_sem);
atomic_set(&net->xfrm.flow_cache_gc_count, 0);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
fc->high_watermark = 4 * flow_cache_hash_size(fc);
fc->percpu = alloc_percpu(struct flow_cache_percpu);
if (!fc->percpu)
return -ENOMEM;
if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
goto err;
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&fc->rnd_timer);
return 0;
err:
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
kfree(fcp->hash_table);
fcp->hash_table = NULL;
}
free_percpu(fc->percpu);
fc->percpu = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);
void flow_cache_fini(struct net *net)
{
int i;
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
kfree(fcp->hash_table);
fcp->hash_table = NULL;
}
free_percpu(fc->percpu);
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
void __init flow_cache_hp_init(void)
{
int ret;
ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
"net/flow:prepare",
flow_cache_cpu_up_prep,
flow_cache_cpu_dead);
WARN_ON(ret < 0);
}
......@@ -584,33 +584,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.get_link_net = ip_tunnel_get_link_net,
};
static bool is_vti_tunnel(const struct net_device *dev)
{
return dev->netdev_ops == &vti_netdev_ops;
}
static int vti_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct ip_tunnel *tunnel = netdev_priv(dev);
if (!is_vti_tunnel(dev))
return NOTIFY_DONE;
switch (event) {
case NETDEV_DOWN:
if (!net_eq(tunnel->net, dev_net(dev)))
xfrm_garbage_collect(tunnel->net);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block vti_notifier_block __read_mostly = {
.notifier_call = vti_device_event,
};
static int __init vti_init(void)
{
const char *msg;
......@@ -618,8 +591,6 @@ static int __init vti_init(void)
pr_info("IPv4 over IPsec tunneling driver\n");
register_netdevice_notifier(&vti_notifier_block);
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
......@@ -652,7 +623,6 @@ static int __init vti_init(void)
xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
}
......@@ -664,7 +634,6 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
unregister_netdevice_notifier(&vti_notifier_block);
}
module_init(vti_init);
......
......@@ -213,14 +213,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
fl4->flowi4_tos = iph->tos;
}
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
xfrm_garbage_collect_deferred(net);
return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
......@@ -259,14 +251,13 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = INT_MAX,
.gc_thresh = 32768,
};
static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
......
......@@ -1145,33 +1145,6 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
.priority = 100,
};
static bool is_vti6_tunnel(const struct net_device *dev)
{
return dev->netdev_ops == &vti6_netdev_ops;
}
static int vti6_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct ip6_tnl *t = netdev_priv(dev);
if (!is_vti6_tunnel(dev))
return NOTIFY_DONE;
switch (event) {
case NETDEV_DOWN:
if (!net_eq(t->net, dev_net(dev)))
xfrm_garbage_collect(t->net);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block vti6_notifier_block __read_mostly = {
.notifier_call = vti6_device_event,
};
/**
* vti6_tunnel_init - register protocol and reserve needed resources
*
......@@ -1182,8 +1155,6 @@ static int __init vti6_tunnel_init(void)
const char *msg;
int err;
register_netdevice_notifier(&vti6_notifier_block);
msg = "tunnel device";
err = register_pernet_device(&vti6_net_ops);
if (err < 0)
......@@ -1216,7 +1187,6 @@ static int __init vti6_tunnel_init(void)
xfrm_proto_esp_failed:
unregister_pernet_device(&vti6_net_ops);
pernet_dev_failed:
unregister_netdevice_notifier(&vti6_notifier_block);
pr_err("vti6 init: failed to register %s\n", msg);
return err;
}
......@@ -1231,7 +1201,6 @@ static void __exit vti6_tunnel_cleanup(void)
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti6_net_ops);
unregister_netdevice_notifier(&vti6_notifier_block);
}
module_init(vti6_tunnel_init);
......
......@@ -214,14 +214,6 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
}
}
static inline int xfrm6_garbage_collect(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
xfrm_garbage_collect_deferred(net);
return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
}
static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
......@@ -279,14 +271,13 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm6_dst_ops_template = {
.family = AF_INET6,
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
.redirect = xfrm6_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
.gc_thresh = INT_MAX,
.gc_thresh = 32768,
};
static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
......
......@@ -2398,8 +2398,6 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
out:
xfrm_pol_put(xp);
if (err == 0)
xfrm_garbage_collect(net);
return err;
}
......@@ -2650,8 +2648,6 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
out:
xfrm_pol_put(xp);
if (delete && err == 0)
xfrm_garbage_collect(net);
return err;
}
......@@ -2751,8 +2747,6 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad
int err, err2;
err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true);
if (!err)
xfrm_garbage_collect(net);
err2 = unicast_flush_resp(sk, hdr);
if (err || err2) {
if (err == -ESRCH) /* empty table - old silent behavior */
......
......@@ -153,6 +153,7 @@ static int xfrm_dev_register(struct net_device *dev)
static int xfrm_dev_unregister(struct net_device *dev)
{
xfrm_policy_cache_flush();
return NOTIFY_DONE;
}
......@@ -175,8 +176,7 @@ static int xfrm_dev_down(struct net_device *dev)
if (dev->features & NETIF_F_HW_ESP)
xfrm_dev_state_flush(dev_net(dev), dev, true);
xfrm_garbage_collect(dev_net(dev));
xfrm_policy_cache_flush();
return NOTIFY_DONE;
}
......
......@@ -24,6 +24,7 @@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <net/dst.h>
#include <net/flow.h>
......@@ -44,6 +45,8 @@ struct xfrm_flo {
u8 flags;
};
static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
static struct work_struct *xfrm_pcpu_work __read_mostly;
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
......@@ -246,36 +249,6 @@ static void xfrm_policy_timer(unsigned long data)
xfrm_pol_put(xp);
}
static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
{
struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
if (unlikely(pol->walk.dead))
flo = NULL;
else
xfrm_pol_hold(pol);
return flo;
}
static int xfrm_policy_flo_check(struct flow_cache_object *flo)
{
struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
return !pol->walk.dead;
}
static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
{
xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
}
static const struct flow_cache_ops xfrm_policy_fc_ops = {
.get = xfrm_policy_flo_get,
.check = xfrm_policy_flo_check,
.delete = xfrm_policy_flo_delete,
};
/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
* SPD calls.
*/
......@@ -298,7 +271,6 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
(unsigned long)policy);
setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
(unsigned long)policy);
policy->flo.ops = &xfrm_policy_fc_ops;
}
return policy;
}
......@@ -798,7 +770,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
else
hlist_add_head(&policy->bydst, chain);
__xfrm_policy_link(policy, dir);
atomic_inc(&net->xfrm.flow_cache_genid);
/* After previous checking, family can either be AF_INET or AF_INET6 */
if (policy->family == AF_INET)
......@@ -1004,6 +975,8 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
}
if (!cnt)
err = -ESRCH;
else
xfrm_policy_cache_flush();
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
return err;
......@@ -1175,7 +1148,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
static struct xfrm_policy *
__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
......@@ -1187,61 +1160,6 @@ __xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir
return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
}
static int flow_to_policy_dir(int dir)
{
if (XFRM_POLICY_IN == FLOW_DIR_IN &&
XFRM_POLICY_OUT == FLOW_DIR_OUT &&
XFRM_POLICY_FWD == FLOW_DIR_FWD)
return dir;
switch (dir) {
default:
case FLOW_DIR_IN:
return XFRM_POLICY_IN;
case FLOW_DIR_OUT:
return XFRM_POLICY_OUT;
case FLOW_DIR_FWD:
return XFRM_POLICY_FWD;
}
}
static struct flow_cache_object *
xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
u8 dir, struct flow_cache_object *old_obj, void *ctx)
{
struct xfrm_policy *pol;
if (old_obj)
xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
if (IS_ERR_OR_NULL(pol))
return ERR_CAST(pol);
/* Resolver returns two references:
* one for cache and one for caller of flow_cache_lookup() */
xfrm_pol_hold(pol);
return &pol->flo;
}
static inline int policy_to_flow_dir(int dir)
{
if (XFRM_POLICY_IN == FLOW_DIR_IN &&
XFRM_POLICY_OUT == FLOW_DIR_OUT &&
XFRM_POLICY_FWD == FLOW_DIR_FWD)
return dir;
switch (dir) {
default:
case XFRM_POLICY_IN:
return FLOW_DIR_IN;
case XFRM_POLICY_OUT:
return FLOW_DIR_OUT;
case XFRM_POLICY_FWD:
return FLOW_DIR_FWD;
}
}
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
const struct flowi *fl, u16 family)
{
......@@ -1261,7 +1179,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
}
err = security_xfrm_policy_lookup(pol->security,
fl->flowi_secid,
policy_to_flow_dir(dir));
dir);
if (!err) {
if (!xfrm_pol_hold_rcu(pol))
goto again;
......@@ -1545,58 +1463,6 @@ static int xfrm_get_tos(const struct flowi *fl, int family)
return tos;
}
static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
{
struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
struct dst_entry *dst = &xdst->u.dst;
if (xdst->route == NULL) {
/* Dummy bundle - if it has xfrms we were not
* able to build bundle as template resolution failed.
* It means we need to try again resolving. */
if (xdst->num_xfrms > 0)
return NULL;
} else if (dst->flags & DST_XFRM_QUEUE) {
return NULL;
} else {
/* Real bundle */
if (stale_bundle(dst))
return NULL;
}
dst_hold(dst);
return flo;
}
static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
{
struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
struct dst_entry *dst = &xdst->u.dst;
if (!xdst->route)
return 0;
if (stale_bundle(dst))
return 0;
return 1;
}
static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
{
struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
struct dst_entry *dst = &xdst->u.dst;
/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
dst->obsolete = DST_OBSOLETE_DEAD;
dst_release_immediate(dst);
}
static const struct flow_cache_ops xfrm_bundle_fc_ops = {
.get = xfrm_bundle_flo_get,
.check = xfrm_bundle_flo_check,
.delete = xfrm_bundle_flo_delete,
};
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
......@@ -1624,7 +1490,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
struct dst_entry *dst = &xdst->u.dst;
memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
xdst->flo.ops = &xfrm_bundle_fc_ops;
} else
xdst = ERR_PTR(-ENOBUFS);
......@@ -1840,6 +1705,102 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
}
static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
{
this_cpu_write(xfrm_last_dst, xdst);
if (old)
dst_release(&old->u.dst);
}
static void __xfrm_pcpu_work_fn(void)
{
struct xfrm_dst *old;
old = this_cpu_read(xfrm_last_dst);
if (old && !xfrm_bundle_ok(old))
xfrm_last_dst_update(NULL, old);
}
static void xfrm_pcpu_work_fn(struct work_struct *work)
{
local_bh_disable();
rcu_read_lock();
__xfrm_pcpu_work_fn();
rcu_read_unlock();
local_bh_enable();
}
void xfrm_policy_cache_flush(void)
{
struct xfrm_dst *old;
bool found = 0;
int cpu;
local_bh_disable();
rcu_read_lock();
for_each_possible_cpu(cpu) {
old = per_cpu(xfrm_last_dst, cpu);
if (old && !xfrm_bundle_ok(old)) {
if (smp_processor_id() == cpu) {
__xfrm_pcpu_work_fn();
continue;
}
found = true;
break;
}
}
rcu_read_unlock();
local_bh_enable();
if (!found)
return;
get_online_cpus();
for_each_possible_cpu(cpu) {
bool bundle_release;
rcu_read_lock();
old = per_cpu(xfrm_last_dst, cpu);
bundle_release = old && !xfrm_bundle_ok(old);
rcu_read_unlock();
if (!bundle_release)
continue;
if (cpu_online(cpu)) {
schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
continue;
}
rcu_read_lock();
old = per_cpu(xfrm_last_dst, cpu);
if (old && !xfrm_bundle_ok(old)) {
per_cpu(xfrm_last_dst, cpu) = NULL;
dst_release(&old->u.dst);
}
rcu_read_unlock();
}
put_online_cpus();
}
static bool xfrm_pol_dead(struct xfrm_dst *xdst)
{
unsigned int num_pols = xdst->num_pols;
unsigned int pol_dead = 0, i;
for (i = 0; i < num_pols; i++)
pol_dead |= xdst->pols[i]->walk.dead;
/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
if (pol_dead)
xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
return pol_dead;
}
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
const struct flowi *fl, u16 family,
......@@ -1847,10 +1808,22 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
{
struct net *net = xp_net(pols[0]);
struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
struct xfrm_dst *xdst, *old;
struct dst_entry *dst;
struct xfrm_dst *xdst;
int err;
xdst = this_cpu_read(xfrm_last_dst);
if (xdst &&
xdst->u.dst.dev == dst_orig->dev &&
xdst->num_pols == num_pols &&
!xfrm_pol_dead(xdst) &&
memcmp(xdst->pols, pols,
sizeof(struct xfrm_policy *) * num_pols) == 0) {
dst_hold(&xdst->u.dst);
return xdst;
}
old = xdst;
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
......@@ -1871,6 +1844,9 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
atomic_set(&xdst->u.dst.__refcnt, 2);
xfrm_last_dst_update(xdst, old);
return xdst;
}
......@@ -2051,86 +2027,39 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
goto out;
}
static struct flow_cache_object *
xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
struct flow_cache_object *oldflo, void *ctx)
static struct xfrm_dst *
xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
{
struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct xfrm_dst *xdst, *new_xdst;
int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
/* Check if the policies from old bundle are usable */
xdst = NULL;
if (oldflo) {
xdst = container_of(oldflo, struct xfrm_dst, flo);
num_pols = xdst->num_pols;
num_xfrms = xdst->num_xfrms;
pol_dead = 0;
for (i = 0; i < num_pols; i++) {
pols[i] = xdst->pols[i];
pol_dead |= pols[i]->walk.dead;
}
if (pol_dead) {
/* Mark DST_OBSOLETE_DEAD to fail the next
* xfrm_dst_check()
*/
xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
dst_release_immediate(&xdst->u.dst);
xdst = NULL;
num_pols = 0;
num_xfrms = 0;
oldflo = NULL;
}
}
int num_pols = 0, num_xfrms = 0, err;
struct xfrm_dst *xdst;
/* Resolve policies to use if we couldn't get them from
* previous cache entry */
if (xdst == NULL) {
num_pols = 1;
pols[0] = __xfrm_policy_lookup(net, fl, family,
flow_to_policy_dir(dir));
err = xfrm_expand_policies(fl, family, pols,
num_pols = 1;
pols[0] = xfrm_policy_lookup(net, fl, family, dir);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
goto inc_error;
if (num_pols == 0)
return NULL;
if (num_xfrms <= 0)
goto make_dummy_bundle;
}
if (err < 0)
goto inc_error;
if (num_pols == 0)
return NULL;
if (num_xfrms <= 0)
goto make_dummy_bundle;
new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
xflo->dst_orig);
if (IS_ERR(new_xdst)) {
err = PTR_ERR(new_xdst);
if (IS_ERR(xdst)) {
err = PTR_ERR(xdst);
if (err != -EAGAIN)
goto error;
if (oldflo == NULL)
goto make_dummy_bundle;
dst_hold(&xdst->u.dst);
return oldflo;
} else if (new_xdst == NULL) {
goto make_dummy_bundle;
} else if (xdst == NULL) {
num_xfrms = 0;
if (oldflo == NULL)
goto make_dummy_bundle;
xdst->num_xfrms = 0;
dst_hold(&xdst->u.dst);
return oldflo;
}
/* Kill the previous bundle */
if (xdst) {
/* The policies were stolen for newly generated bundle */
xdst->num_pols = 0;
/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
dst_release_immediate(&xdst->u.dst);
goto make_dummy_bundle;
}
/* We do need to return one reference for original caller */
dst_hold(&new_xdst->u.dst);
return &new_xdst->flo;
return xdst;
make_dummy_bundle:
/* We found policies, but there's no bundles to instantiate:
......@@ -2146,17 +2075,12 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
dst_hold(&xdst->u.dst);
return &xdst->flo;
return xdst;
inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
if (xdst != NULL) {
/* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
dst_release_immediate(&xdst->u.dst);
} else
xfrm_pols_put(pols, num_pols);
xfrm_pols_put(pols, num_pols);
return ERR_PTR(err);
}
......@@ -2187,11 +2111,10 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
const struct sock *sk, int flags)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct flow_cache_object *flo;
struct xfrm_dst *xdst;
struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
u8 dir = XFRM_POLICY_OUT;
int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
dst = NULL;
......@@ -2242,15 +2165,13 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
flo = flow_cache_lookup(net, fl, family, dir,
xfrm_bundle_lookup, &xflo);
if (flo == NULL)
xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
if (xdst == NULL)
goto nopol;
if (IS_ERR(flo)) {
err = PTR_ERR(flo);
if (IS_ERR(xdst)) {
err = PTR_ERR(xdst);
goto dropdst;
}
xdst = container_of(flo, struct xfrm_dst, flo);
num_pols = xdst->num_pols;
num_xfrms = xdst->num_xfrms;
......@@ -2449,12 +2370,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
int pi;
int reverse;
struct flowi fl;
u8 fl_dir;
int xerr_idx = -1;
reverse = dir & ~XFRM_POLICY_MASK;
dir &= XFRM_POLICY_MASK;
fl_dir = policy_to_flow_dir(dir);
if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
......@@ -2486,16 +2405,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
}
if (!pol) {
struct flow_cache_object *flo;
flo = flow_cache_lookup(net, &fl, family, fl_dir,
xfrm_policy_lookup, NULL);
if (IS_ERR_OR_NULL(flo))
pol = ERR_CAST(flo);
else
pol = container_of(flo, struct xfrm_policy, flo);
}
if (!pol)
pol = xfrm_policy_lookup(net, &fl, family, dir);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
......@@ -2641,11 +2552,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
* notice. That's what we are validating here via the
* stale_bundle() check.
*
* When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will
* be marked on it.
* When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will
* be marked on it.
* Both will force stable_bundle() to fail on any xdst bundle with
* This will force stale_bundle() to fail on any xdst bundle with
* this dst linked in it.
*/
if (dst->obsolete < 0 && !stale_bundle(dst))
......@@ -2685,18 +2594,6 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
return dst;
}
void xfrm_garbage_collect(struct net *net)
{
flow_cache_flush(net);
}
EXPORT_SYMBOL(xfrm_garbage_collect);
void xfrm_garbage_collect_deferred(struct net *net)
{
flow_cache_flush_deferred(net);
}
EXPORT_SYMBOL(xfrm_garbage_collect_deferred);
static void xfrm_init_pmtu(struct dst_entry *dst)
{
do {
......@@ -3034,14 +2931,9 @@ static int __net_init xfrm_net_init(struct net *net)
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
rv = flow_cache_init(net);
if (rv < 0)
goto out;
return 0;
out:
xfrm_sysctl_fini(net);
out_sysctl:
xfrm_policy_fini(net);
out_policy:
......@@ -3054,7 +2946,6 @@ static int __net_init xfrm_net_init(struct net *net)
static void __net_exit xfrm_net_exit(struct net *net)
{
flow_cache_fini(net);
xfrm_sysctl_fini(net);
xfrm_policy_fini(net);
xfrm_state_fini(net);
......@@ -3068,7 +2959,15 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
flow_cache_hp_init();
int i;
xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
GFP_KERNEL);
BUG_ON(!xfrm_pcpu_work);
for (i = 0; i < NR_CPUS; i++)
INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
register_pernet_subsys(&xfrm_net_ops);
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
......
......@@ -724,9 +724,10 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
}
}
}
if (cnt)
if (cnt) {
err = 0;
xfrm_policy_cache_flush();
}
out:
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return err;
......
......@@ -1815,8 +1815,6 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
out:
xfrm_pol_put(xp);
if (delete && err == 0)
xfrm_garbage_collect(net);
return err;
}
......@@ -2027,7 +2025,6 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
return err;
}
xfrm_garbage_collect(net);
c.data.type = type;
c.event = nlh->nlmsg_type;
......
......@@ -47,10 +47,8 @@ static inline void selinux_xfrm_notify_policyload(void)
struct net *net;
rtnl_lock();
for_each_net(net) {
atomic_inc(&net->xfrm.flow_cache_genid);
for_each_net(net)
rt_genid_bump_all(net);
}
rtnl_unlock();
}
#else
......