// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/core/dst.c	Protocol independent destination cache.
 *
 * Authors:		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
#include <net/xfrm.h>

#include <net/dst.h>
#include <net/dst_metadata.h>
/* Fallback ->output handler: silently drop the packet.
 * Returns 0 so callers treat the skb as consumed rather than retrying.
 */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL(dst_discard_out);
/* Shared, read-only metrics every dst starts with until it makes a
 * private writable copy (see dst_cow_metrics_generic()).
 */
const struct dst_metrics dst_default_metrics = {
	/* This initializer is needed to force linker to place this variable
	 * into const section. Otherwise it might end into bss section.
	 * We really want to avoid false sharing on this variable, and catch
	 * any writes on it.
	 */
	.refcnt = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL(dst_default_metrics);
/* Initialize an already-allocated dst_entry.
 *
 * Takes a reference on @dev (when non-NULL) and accounts the entry
 * against @ops' entry counter unless DST_NOCOUNT is set in @flags.
 * Input/output hooks start as discard stubs; callers override them
 * after initialization.
 */
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
	      struct net_device *dev, int initial_ref, int initial_obsolete,
	      unsigned short flags)
{
	dst->dev = dev;
	if (dev)
		dev_hold(dev);
	dst->ops = ops;
	/* Point at the shared read-only defaults; COWed on first write. */
	dst_init_metrics(dst, dst_default_metrics.metrics, true);
	dst->expires = 0UL;
#ifdef CONFIG_XFRM
	dst->xfrm = NULL;
#endif
	dst->input = dst_discard;
	dst->output = dst_discard_out;
	dst->error = 0;
	dst->obsolete = initial_obsolete;
	dst->header_len = 0;
	dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
	dst->tclassid = 0;
#endif
	dst->lwtstate = NULL;
	atomic_set(&dst->__refcnt, initial_ref);
	dst->__use = 0;
	dst->lastuse = jiffies;
	dst->flags = flags;
	if (!(flags & DST_NOCOUNT))
		dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

/* Allocate and initialize a dst_entry from @ops' slab cache.
 *
 * For accounted entries (no DST_NOCOUNT) over ops->gc_thresh, give the
 * garbage collector a chance first; a non-zero return from ->gc() means
 * the table is still full and the allocation fails.
 * Returns the new entry, or NULL on failure.
 */
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
		int initial_ref, int initial_obsolete, unsigned short flags)
{
	struct dst_entry *dst;

	if (ops->gc &&
	    !(flags & DST_NOCOUNT) &&
	    dst_entries_get_fast(ops) > ops->gc_thresh) {
		if (ops->gc(ops)) {
			pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
			return NULL;
		}
	}

	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
	if (!dst)
		return NULL;

	dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

	return dst;
}
EXPORT_SYMBOL(dst_alloc);
/* Tear down a dst whose last reference has been dropped.
 *
 * Undoes what dst_init()/dst_alloc() set up: entry accounting, the
 * device reference, lwtunnel state, and the backing allocation
 * (metadata dsts take their own free path).  If this dst was an xfrm
 * bundle member, the reference it held on its child is dropped too.
 * Always returns NULL.
 */
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
	struct dst_entry *child = NULL;

	/* NOTE(review): read barrier kept from the original refcounting
	 * scheme; presumably pairs with the publishing writer -- confirm
	 * before removing.
	 */
	smp_rmb();

#ifdef CONFIG_XFRM
	if (dst->xfrm) {
		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

		child = xdst->child;
	}
#endif
	if (!(dst->flags & DST_NOCOUNT))
		dst_entries_add(dst->ops, -1);

	if (dst->ops->destroy)
		dst->ops->destroy(dst);
	if (dst->dev)
		dev_put(dst->dev);

	lwtstate_put(dst->lwtstate);

	if (dst->flags & DST_METADATA)
		metadata_dst_free((struct metadata_dst *)dst);
	else
		kmem_cache_free(dst->ops->kmem_cachep, dst);

	/* Release the reference held on the xfrm child, if any. */
	dst = child;
	if (dst)
		dst_release_immediate(dst);
	return NULL;
}
EXPORT_SYMBOL(dst_destroy);
/* RCU callback: perform the actual teardown after the grace period
 * started by dst_release() has elapsed.
 */
static void dst_destroy_rcu(struct rcu_head *head)
{
	struct dst_entry *entry = container_of(head, struct dst_entry, rcu_head);

	dst_destroy(entry);
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under blackhole interface and discard all tx/rx packets
 *    on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;

	dst->obsolete = DST_OBSOLETE_DEAD;
	if (dst->ops->ifdown)
		dst->ops->ifdown(dst, dev, true);
	dst->input = dst_discard;
	dst->output = dst_discard_out;
	/* Re-point the dst at the blackhole device so the reference on
	 * the original device can be dropped.
	 */
	dst->dev = blackhole_netdev;
	dev_hold(dst->dev);
	dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);

/* Drop a reference on @dst; on the last one, defer destruction to an
 * RCU grace period so lockless readers remain safe.  NULL is tolerated.
 * A negative refcount after the decrement indicates an underflow bug
 * and is warned about (once) rather than acted on.
 */
void dst_release(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
			call_rcu(&dst->rcu_head, dst_destroy_rcu);
	}
}
EXPORT_SYMBOL(dst_release);

/* Like dst_release(), but destroys the dst synchronously instead of
 * waiting for an RCU grace period -- callers must know no concurrent
 * readers can still see this dst.  NULL is tolerated.
 */
void dst_release_immediate(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
			dst_destroy(dst);
	}
}
EXPORT_SYMBOL(dst_release_immediate);

/* Copy-on-write the metrics of @dst.
 *
 * @old is the raw dst->_metrics word the caller observed.  A private
 * writable copy is allocated and installed with cmpxchg().  If the race
 * is lost, the fresh copy is discarded and the winner's metrics are
 * returned when writable (NULL when read-only).  If the race is won and
 * the previous metrics were refcounted, that reference is dropped.
 * Returns writable metrics, or NULL on allocation failure or a
 * read-only race loss.
 */
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
	struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

	if (p) {
		struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
		unsigned long prev, new;

		refcount_set(&p->refcnt, 1);
		memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: fall back to the winner's copy. */
			kfree(p);
			p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (prev & DST_METRICS_REFCOUNTED) {
			if (refcount_dec_and_test(&old_p->refcnt))
				kfree(old_p);
		}
	}
	/* The cast below relies on the metrics array living at offset 0. */
	BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
	return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

/* Caller asserts that dst_metrics_read_only(dst) is false.  */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
	unsigned long prev, new;

	/* Swing the dst back to the shared read-only defaults; free the
	 * private copy only if nobody changed _metrics underneath us.
	 */
	new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
	prev = cmpxchg(&dst->_metrics, old, new);
	if (prev == old)
		kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

/* ->check() stub for blackhole dsts: unconditionally returns NULL. */
struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

/* ->cow_metrics() stub for blackhole dsts: never yields writable metrics. */
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	return NULL;
}

/* ->neigh_lookup() stub for blackhole dsts: never finds a neighbour. */
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
					     struct sk_buff *skb,
					     const void *daddr)
{
	return NULL;
}

/* ->update_pmtu() stub for blackhole dsts: deliberately a no-op. */
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh)
{
}
EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);

/* ->redirect() stub for blackhole dsts: deliberately a no-op. */
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb)
{
}
EXPORT_SYMBOL_GPL(dst_blackhole_redirect);

/* ->mtu() for blackhole dsts: use the raw RTAX_MTU metric when it is
 * non-zero, otherwise fall back to the device MTU.
 */
unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	return dst->dev->mtu;
}
EXPORT_SYMBOL_GPL(dst_blackhole_mtu);

/* Shared dst_ops for dsts that must never forward traffic; every
 * callback is one of the blackhole stubs above.
 */
static struct dst_ops dst_blackhole_ops = {
	.family		= AF_UNSPEC,
	.neigh_lookup	= dst_blackhole_neigh_lookup,
	.check		= dst_blackhole_check,
	.cow_metrics	= dst_blackhole_cow_metrics,
	.update_pmtu	= dst_blackhole_update_pmtu,
	.redirect	= dst_blackhole_redirect,
	.mtu		= dst_blackhole_mtu,
};

/* Common initializer for metadata dsts: set up the embedded dst_entry
 * as an unaccounted blackhole dst, then zero everything past it --
 * the rest of struct metadata_dst plus @optslen bytes of options.
 */
static void __metadata_dst_init(struct metadata_dst *md_dst,
				enum metadata_type type, u8 optslen)
{
	struct dst_entry *dst;

	dst = &md_dst->dst;
	dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
		 DST_METADATA | DST_NOCOUNT);
	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
	md_dst->type = type;
}

300 301
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
					gfp_t flags)
302 303 304 305 306 307 308
{
	struct metadata_dst *md_dst;

	md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
	if (!md_dst)
		return NULL;

309
	__metadata_dst_init(md_dst, type, optslen);
T
Thomas Graf 已提交
310 311 312 313 314

	return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);

/* Free a metadata dst allocated by metadata_dst_alloc(), destroying the
 * tunnel dst cache first when the dst carries one.
 */
void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
	switch (md_dst->type) {
	case METADATA_IP_TUNNEL:
		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
		break;
	default:
		break;
	}
#endif
	kfree(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free);

/* Allocate one metadata dst per possible CPU, each with room for
 * @optslen bytes of tunnel options.  Returns NULL on allocation failure.
 */
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
	int cpu;
	struct metadata_dst __percpu *md_dst;

	md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
				    __alignof__(struct metadata_dst), flags);
	if (!md_dst)
		return NULL;

	for_each_possible_cpu(cpu)
		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);

	return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
342 343 344

void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
345
#ifdef CONFIG_DST_CACHE
346 347 348 349 350 351 352 353 354 355 356 357
	int cpu;

	for_each_possible_cpu(cpu) {
		struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);

		if (one_md_dst->type == METADATA_IP_TUNNEL)
			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
	}
#endif
	free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);