ip_output.c 42.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
L
Linus Torvalds 已提交
2 3 4 5 6 7 8
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
9
 * Authors:	Ross Biro
L
Linus Torvalds 已提交
10 11 12 13 14 15 16 17 18 19 20 21 22 23
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
24
 *		Bradford Johnson:	Fix faulty handling of some frames when
L
Linus Torvalds 已提交
25 26 27 28 29 30 31 32 33 34
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
35 36 37
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

46
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
47 48 49 50 51 52
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
A
Al Viro 已提交
53
#include <linux/highmem.h>
54
#include <linux/slab.h>
L
Linus Torvalds 已提交
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
70
#include <net/xfrm.h>
L
Linus Torvalds 已提交
71 72 73 74 75 76
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
77
#include <net/lwtunnel.h>
78
#include <linux/bpf-cgroup.h>
L
Linus Torvalds 已提交
79 80 81 82
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
83
#include <linux/tcp.h>
L
Linus Torvalds 已提交
84

85 86 87 88
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
	    unsigned int mtu,
	    int (*output)(struct net *, struct sock *, struct sk_buff *));
89

L
Linus Torvalds 已提交
90
/* Generate a checksum for an outgoing IP datagram. */
91
void ip_send_check(struct iphdr *iph)
L
Linus Torvalds 已提交
92 93 94 95
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
E
Eric Dumazet 已提交
96
EXPORT_SYMBOL(ip_send_check);
L
Linus Torvalds 已提交
97

98
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
H
Herbert Xu 已提交
99 100 101 102 103
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
104 105 106 107 108 109 110 111

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

112 113
	skb->protocol = htons(ETH_P_IP);

114 115
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
116
		       dst_output);
117 118
}

119
/*
 * Send a locally generated packet: finish the header via __ip_local_out()
 * and, only when that returns exactly 1, pass the skb on to dst_output().
 * Any other value from __ip_local_out() is returned to the caller as is.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int rc = __ip_local_out(net, sk, skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(net, sk, skb);
}
129
EXPORT_SYMBOL_GPL(ip_local_out);
H
Herbert Xu 已提交
130

L
Linus Torvalds 已提交
131 132 133 134 135
/*
 * Pick the TTL for an outgoing packet: the socket's uc_ttl when it is
 * non-negative, otherwise the hop limit reported for the route.
 */
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	const int sock_ttl = inet->uc_ttl;

	return sock_ttl < 0 ? ip4_dst_hoplimit(dst) : sock_ttl;
}

140
/*
L
Linus Torvalds 已提交
141 142 143
 *		Add an ip header to a skbuff and send it out.
 *
 */
144
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
145
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
L
Linus Torvalds 已提交
146 147
{
	struct inet_sock *inet = inet_sk(sk);
E
Eric Dumazet 已提交
148
	struct rtable *rt = skb_rtable(skb);
149
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
150 151 152
	struct iphdr *iph;

	/* Build the IP header. */
153
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
154
	skb_reset_network_header(skb);
155
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
156 157 158
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
159
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
160 161
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
L
Linus Torvalds 已提交
162
	iph->protocol = sk->sk_protocol;
163 164 165 166 167
	if (ip_dont_fragment(sk, &rt->dst)) {
		iph->frag_off = htons(IP_DF);
		iph->id = 0;
	} else {
		iph->frag_off = 0;
168
		__ip_select_ident(net, iph, 1);
169
	}
L
Linus Torvalds 已提交
170

171 172 173
	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
L
Linus Torvalds 已提交
174 175 176
	}

	skb->priority = sk->sk_priority;
177 178
	if (!skb->mark)
		skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
179 180

	/* Send it out. */
181
	return ip_local_out(net, skb->sk, skb);
L
Linus Torvalds 已提交
182
}
183 184
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

185
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
186
{
E
Eric Dumazet 已提交
187
	struct dst_entry *dst = skb_dst(skb);
188
	struct rtable *rt = (struct rtable *)dst;
L
Linus Torvalds 已提交
189
	struct net_device *dev = dst->dev;
190
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
191
	struct neighbour *neigh;
192
	bool is_v6gw = false;
L
Linus Torvalds 已提交
193

194
	if (rt->rt_type == RTN_MULTICAST) {
195
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
196
	} else if (rt->rt_type == RTN_BROADCAST)
197
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
198

L
Linus Torvalds 已提交
199
	/* Be paranoid, rather than too clever. */
200
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
L
Linus Torvalds 已提交
201 202 203
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
204
		if (!skb2) {
L
Linus Torvalds 已提交
205 206 207 208 209
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
210
		consume_skb(skb);
L
Linus Torvalds 已提交
211 212 213
		skb = skb2;
	}

214 215 216 217 218 219 220
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

221
	rcu_read_lock_bh();
222
	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
223
	if (!IS_ERR(neigh)) {
224 225 226
		int res;

		sock_confirm_neigh(skb, neigh);
227 228
		/* if crossing protocols, can not use the cached header */
		res = neigh_output(neigh, skb, is_v6gw);
229
		rcu_read_unlock_bh();
230 231
		return res;
	}
232
	rcu_read_unlock_bh();
233

234 235
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
L
Linus Torvalds 已提交
236 237 238 239
	kfree_skb(skb);
	return -EINVAL;
}

240 241
static int ip_finish_output_gso(struct net *net, struct sock *sk,
				struct sk_buff *skb, unsigned int mtu)
242 243 244 245 246
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

247
	/* common case: seglen is <= mtu
248
	 */
249
	if (skb_gso_validate_network_len(skb, mtu))
250
		return ip_finish_output2(net, sk, skb);
251

252
	/* Slowpath -  GSO segment length exceeds the egress MTU.
253
	 *
254 255 256 257 258 259 260 261 262 263
	 * This can happen in several cases:
	 *  - Forwarding of a TCP GRO skb, when DF flag is not set.
	 *  - Forwarding of an skb that arrived on a virtualization interface
	 *    (virtio-net/vhost/tap) with TSO/GSO size set by other network
	 *    stack.
	 *  - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
	 *    interface with a smaller MTU.
	 *  - Arriving GRO skb (or GSO skb in a virtualized environment) that is
	 *    bridged to a NETIF_F_TSO tunnel stacked over an interface with an
	 *    insufficent MTU.
264 265
	 */
	features = netif_skb_features(skb);
266
	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
267
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
268
	if (IS_ERR_OR_NULL(segs)) {
269 270 271 272 273 274 275 276 277 278
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

279
		skb_mark_not_on_list(segs);
280
		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
281 282 283 284 285 286 287 288 289

		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

290
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
291
{
292 293
	unsigned int mtu;

294 295
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
296
	if (skb_dst(skb)->xfrm) {
297
		IPCB(skb)->flags |= IPSKB_REROUTED;
298
		return dst_output(net, sk, skb);
299
	}
300
#endif
301
	mtu = ip_skb_dst_mtu(sk, skb);
302
	if (skb_is_gso(skb))
303
		return ip_finish_output_gso(net, sk, skb, mtu);
304

305
	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
306
		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
307

308
	return ip_finish_output2(net, sk, skb);
L
Linus Torvalds 已提交
309 310
}

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
/*
 * Run the cgroup BPF egress program and act on its verdict:
 *  - NET_XMIT_SUCCESS: transmit and return the transmit result;
 *  - NET_XMIT_CN: transmit, returning the transmit error if there is one
 *    and NET_XMIT_CN otherwise;
 *  - anything else: drop the skb and return the verdict.
 */
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	int err;

	if (verdict != NET_XMIT_SUCCESS && verdict != NET_XMIT_CN) {
		kfree_skb(skb);
		return verdict;
	}

	err = __ip_finish_output(net, sk, skb);
	if (verdict == NET_XMIT_CN && !err)
		return verdict;

	return err;
}

327 328 329 330 331 332
static int ip_mc_finish_output(struct net *net, struct sock *sk,
			       struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
333 334 335 336 337 338
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return dev_loopback_xmit(net, sk, skb);
	case NET_XMIT_CN:
		return dev_loopback_xmit(net, sk, skb) ? : ret;
	default:
339 340 341 342 343
		kfree_skb(skb);
		return ret;
	}
}

E
Eric W. Biederman 已提交
344
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
345
{
E
Eric Dumazet 已提交
346
	struct rtable *rt = skb_rtable(skb);
347
	struct net_device *dev = rt->dst.dev;
L
Linus Torvalds 已提交
348 349 350 351

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
352
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
353 354 355 356 357 358 359 360 361

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
362
		if (sk_mc_loop(sk)
L
Linus Torvalds 已提交
363 364 365 366 367 368 369 370 371
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be  dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
372 373 374
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
L
Linus Torvalds 已提交
375
#endif
376
		   ) {
L
Linus Torvalds 已提交
377 378
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
379
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
380
					net, sk, newskb, NULL, newskb->dev,
381
					ip_mc_finish_output);
L
Linus Torvalds 已提交
382 383 384 385
		}

		/* Multicasts with ttl 0 must not go beyond the host */

386
		if (ip_hdr(skb)->ttl == 0) {
L
Linus Torvalds 已提交
387 388 389 390 391 392 393 394
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
395 396
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				net, sk, newskb, NULL, newskb->dev,
397
				ip_mc_finish_output);
L
Linus Torvalds 已提交
398 399
	}

400 401 402
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb->dev,
			    ip_finish_output,
403
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
404 405
}

E
Eric W. Biederman 已提交
406
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
407
{
E
Eric Dumazet 已提交
408
	struct net_device *dev = skb_dst(skb)->dev;
409

410
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
411

412 413 414
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

415 416
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
417
			    ip_finish_output,
418
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
419 420
}

421 422 423 424 425 426 427 428 429 430 431 432 433 434
/*
 * Copy the source and destination addresses from the flow into the IP
 * header with one memcpy(), possibly using 64bit load/stores.
 * Same effect as:
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	/* The single copy is only valid when daddr directly follows saddr. */
	BUILD_BUG_ON(offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr) !=
		     offsetof(typeof(*fl4), daddr));

	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->daddr) + sizeof(fl4->saddr));
}

435
/* Note: skb->sk can be different from sk, in case of tunnels */
436 437
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
		    __u8 tos)
L
Linus Torvalds 已提交
438 439
{
	struct inet_sock *inet = inet_sk(sk);
440
	struct net *net = sock_net(sk);
441
	struct ip_options_rcu *inet_opt;
442
	struct flowi4 *fl4;
L
Linus Torvalds 已提交
443 444
	struct rtable *rt;
	struct iphdr *iph;
445
	int res;
L
Linus Torvalds 已提交
446 447 448 449

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
450
	rcu_read_lock();
451
	inet_opt = rcu_dereference(inet->inet_opt);
452
	fl4 = &fl->u.ip4;
E
Eric Dumazet 已提交
453
	rt = skb_rtable(skb);
454
	if (rt)
L
Linus Torvalds 已提交
455 456 457 458
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
459
	if (!rt) {
A
Al Viro 已提交
460
		__be32 daddr;
L
Linus Torvalds 已提交
461 462

		/* Use correct destination address if we have options. */
E
Eric Dumazet 已提交
463
		daddr = inet->inet_daddr;
464 465
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
L
Linus Torvalds 已提交
466

467 468 469 470
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
471
		rt = ip_route_output_ports(net, fl4, sk,
472 473 474 475
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
476
					   RT_CONN_FLAGS_TOS(sk, tos),
477 478 479
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
480
		sk_setup_caps(sk, &rt->dst);
L
Linus Torvalds 已提交
481
	}
482
	skb_dst_set_noref(skb, &rt->dst);
L
Linus Torvalds 已提交
483 484

packet_routed:
485
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
L
Linus Torvalds 已提交
486 487 488
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
489
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
490
	skb_reset_network_header(skb);
491
	iph = ip_hdr(skb);
492
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
W
WANG Cong 已提交
493
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
L
Linus Torvalds 已提交
494 495 496
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
497
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
498
	iph->protocol = sk->sk_protocol;
499 500
	ip_copy_addrs(iph, fl4);

L
Linus Torvalds 已提交
501 502
	/* Transport layer set skb->h.foo itself. */

503 504 505
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
L
Linus Torvalds 已提交
506 507
	}

508
	ip_select_ident_segs(net, skb, sk,
509
			     skb_shinfo(skb)->gso_segs ?: 1);
L
Linus Torvalds 已提交
510

511
	/* TODO : should we use skb->sk here instead of sk ? */
L
Linus Torvalds 已提交
512
	skb->priority = sk->sk_priority;
513
	skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
514

515
	res = ip_local_out(net, sk, skb);
516 517
	rcu_read_unlock();
	return res;
L
Linus Torvalds 已提交
518 519

no_route:
520
	rcu_read_unlock();
521
	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
522 523 524
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
525
EXPORT_SYMBOL(__ip_queue_xmit);
L
Linus Torvalds 已提交
526 527 528 529 530 531

/*
 * Make fragment @to carry the same per-packet metadata as @from: packet
 * type, priority, protocol, input interface, device, mark, route, hash,
 * netfilter state, skb extensions and security mark, plus the traffic
 * control index and IPVS property when those options are configured.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	/* Plain scalar and pointer fields. */
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->skb_iif = from->skb_iif;
	to->dev = from->dev;
	to->mark = from->mark;

	/* Release whatever route @to held before taking over @from's. */
	skb_dst_drop(to);
	skb_dst_copy(to, from);

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

551
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
552
		       unsigned int mtu,
553
		       int (*output)(struct net *, struct sock *, struct sk_buff *))
554 555 556
{
	struct iphdr *iph = ip_hdr(skb);

557
	if ((iph->frag_off & htons(IP_DF)) == 0)
558
		return ip_do_fragment(net, sk, skb, output);
559 560

	if (unlikely(!skb->ignore_df ||
561 562
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
563
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
564 565 566 567 568 569
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

570
	return ip_do_fragment(net, sk, skb, output);
571 572
}

573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
/*
 * Start fragmenting along an existing frag_list.
 *
 * The frag_list is detached from @skb and remembered in @iter, and @skb is
 * turned into the first fragment: its lengths are cut down to the head
 * plus page data, and its header gets the new total length, the MF flag
 * and a fresh checksum.
 */
void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
		      unsigned int hlen, struct ip_fraglist_iter *iter)
{
	unsigned int head_len = skb_pagelen(skb);

	/* Record the iteration state before the list is detached. */
	iter->hlen = hlen;
	iter->iph = iph;
	iter->offset = 0;
	iter->frag_list = skb_shinfo(skb)->frag_list;
	iter->frag = iter->frag_list;
	skb_frag_list_init(skb);

	/* data_len must be derived before skb->len is overwritten. */
	skb->data_len = head_len - skb_headlen(skb);
	skb->len = head_len;

	iph->tot_len = htons(head_len);
	iph->frag_off = htons(IP_MF);
	ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_init);

594 595 596 597 598 599 600 601 602 603 604 605
/*
 * Prepare the control block of the next frag_list fragment: it inherits
 * the IPCB flags of @skb, and while the iterator is still at offset 0 its
 * options are passed through ip_options_fragment().
 */
static void ip_fraglist_ipcb_prepare(struct sk_buff *skb,
				     struct ip_fraglist_iter *iter)
{
	struct sk_buff *next_frag = iter->frag;

	/* Every fragment carries the same flags as the packet before it. */
	IPCB(next_frag)->flags = IPCB(skb)->flags;

	if (!iter->offset)
		ip_options_fragment(next_frag);
}

606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
{
	unsigned int hlen = iter->hlen;
	struct iphdr *iph = iter->iph;
	struct sk_buff *frag;

	frag = iter->frag;
	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iph, hlen);
	iter->iph = ip_hdr(frag);
	iph = iter->iph;
	iph->tot_len = htons(frag->len);
	ip_copy_metadata(frag, skb);
	iter->offset += skb->len - hlen;
	iph->frag_off = htons(iter->offset >> 3);
	if (frag->next)
		iph->frag_off |= htons(IP_MF);
	/* Ready, complete checksum */
	ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_prepare);

631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
		  unsigned int ll_rs, unsigned int mtu,
		  struct ip_frag_state *state)
{
	struct iphdr *iph = ip_hdr(skb);

	state->hlen = hlen;
	state->ll_rs = ll_rs;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	state->not_last_frag = iph->frag_off & htons(IP_MF);
}
EXPORT_SYMBOL(ip_frag_init);

649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
/*
 * Finish a fragment produced on the copying path.
 *
 * @from:       packet being fragmented
 * @to:         fragment just built from @from
 * @first_frag: true when @to is the fragment at offset 0
 * @state:      fragmentation state; unused here, kept so the caller's
 *              signature stays the same
 *
 * Copies the IPCB flags to the fragment, propagates Don't Fragment onto
 * the fragment when the packet is flagged IPSKB_FRAG_PMTU, and fixes up
 * the options of @from once, for the first fragment.
 */
static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
			 bool first_frag, struct ip_frag_state *state)
{
	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

	/* Nothing in this file assigns state->iph, so DF is set on the
	 * fragment's own header rather than through that pointer.  The
	 * header was already checksummed when the fragment was built, so
	 * the checksum has to be redone after the flag changes.
	 */
	if (IPCB(from)->flags & IPSKB_FRAG_PMTU) {
		struct iphdr *iph = ip_hdr(to);

		iph->frag_off |= htons(IP_DF);
		ip_send_check(iph);
	}

	/* ANK: dirty, but effective trick. Upgrade options only if
	 * the segment to be fragmented was THE FIRST (otherwise,
	 * options are already fixed) and make it ONCE
	 * on the initial skb, so that all the following fragments
	 * will inherit fixed options.
	 */
	if (first_frag)
		ip_options_fragment(from);
}

668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
{
	unsigned int len = state->left;
	struct sk_buff *skb2;
	struct iphdr *iph;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)	{
		len &= ~7;
	}

	/* Allocate buffer */
	skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
	if (!skb2)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip_copy_metadata(skb2, skb);
	skb_reserve(skb2, state->ll_rs);
	skb_put(skb2, len + state->hlen);
	skb_reset_network_header(skb2);
	skb2->transport_header = skb2->network_header + state->hlen;

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */

	if (skb->sk)
		skb_set_owner_w(skb2, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */

	skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);

	/*
	 *	Copy a block of the IP datagram.
	 */
	if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
		BUG();
	state->left -= len;

	/*
	 *	Fill in the new header fields.
	 */
	iph = ip_hdr(skb2);
	iph->frag_off = htons((state->offset >> 3));

	/*
	 *	Added AC : If we are fragmenting a fragment that's not the
	 *		   last fragment then keep MF on each bit
	 */
	if (state->left > 0 || state->not_last_frag)
		iph->frag_off |= htons(IP_MF);
	state->ptr += len;
	state->offset += len;

	iph->tot_len = htons(len + state->hlen);

	ip_send_check(iph);

	return skb2;
}
EXPORT_SYMBOL(ip_frag_next);

L
Linus Torvalds 已提交
743 744 745 746 747 748 749
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

750 751
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct net *, struct sock *, struct sk_buff *))
L
Linus Torvalds 已提交
752 753 754
{
	struct iphdr *iph;
	struct sk_buff *skb2;
E
Eric Dumazet 已提交
755
	struct rtable *rt = skb_rtable(skb);
756
	unsigned int mtu, hlen, ll_rs;
757
	struct ip_fraglist_iter iter;
758
	struct ip_frag_state state;
L
Linus Torvalds 已提交
759 760
	int err = 0;

761 762 763 764 765
	/* for offloaded checksums cleanup checksum before fragmentation */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

L
Linus Torvalds 已提交
766 767 768 769
	/*
	 *	Point into the IP datagram header.
	 */

770
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
771

772
	mtu = ip_skb_dst_mtu(sk, skb);
773 774
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;
L
Linus Torvalds 已提交
775 776 777 778 779 780

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
781
	mtu = mtu - hlen;	/* Size of data space */
H
Herbert Xu 已提交
782
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
783
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
L
Linus Torvalds 已提交
784 785 786 787 788 789 790 791

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
792
	if (skb_has_frag_list(skb)) {
793
		struct sk_buff *frag, *frag2;
794
		unsigned int first_len = skb_pagelen(skb);
L
Linus Torvalds 已提交
795 796 797

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
798
		    ip_is_fragment(iph) ||
799 800
		    skb_cloned(skb) ||
		    skb_headroom(skb) < ll_rs)
L
Linus Torvalds 已提交
801 802
			goto slow_path;

803
		skb_walk_frags(skb, frag) {
L
Linus Torvalds 已提交
804 805 806
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
807
			    skb_headroom(frag) < hlen + ll_rs)
808
				goto slow_path_clean;
L
Linus Torvalds 已提交
809 810 811

			/* Partially cloned skb? */
			if (skb_shared(frag))
812
				goto slow_path_clean;
813 814 815 816 817 818

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
819
			skb->truesize -= frag->truesize;
L
Linus Torvalds 已提交
820 821 822
		}

		/* Everything is OK. Generate! */
823
		ip_fraglist_init(skb, iph, hlen, &iter);
L
Linus Torvalds 已提交
824 825 826 827

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
828 829
			if (iter.frag) {
				ip_fraglist_ipcb_prepare(skb, &iter);
830
				ip_fraglist_prepare(skb, &iter);
831
			}
L
Linus Torvalds 已提交
832

833
			err = output(net, sk, skb);
L
Linus Torvalds 已提交
834

835
			if (!err)
836
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
837
			if (err || !iter.frag)
L
Linus Torvalds 已提交
838 839
				break;

840
			skb = ip_fraglist_next(&iter);
L
Linus Torvalds 已提交
841 842 843
		}

		if (err == 0) {
844
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
845 846 847
			return 0;
		}

848
		kfree_skb_list(iter.frag_list);
849

850
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
851
		return err;
852 853 854 855 856 857 858 859 860

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
L
Linus Torvalds 已提交
861 862 863 864 865 866 867
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

868
	ip_frag_init(skb, hlen, ll_rs, mtu, &state);
L
Linus Torvalds 已提交
869 870 871 872 873

	/*
	 *	Keep copying data until we run out.
	 */

874
	while (state.left > 0) {
875 876
		bool first_frag = (state.offset == 0);

877 878 879
		skb2 = ip_frag_next(skb, &state);
		if (IS_ERR(skb2)) {
			err = PTR_ERR(skb2);
L
Linus Torvalds 已提交
880 881
			goto fail;
		}
882
		ip_frag_ipcb(skb, skb2, first_frag, &state);
L
Linus Torvalds 已提交
883 884 885 886

		/*
		 *	Put this fragment into the sending queue.
		 */
887
		err = output(net, sk, skb2);
L
Linus Torvalds 已提交
888 889
		if (err)
			goto fail;
890

891
		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
892
	}
893
	consume_skb(skb);
894
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
895 896 897
	return err;

fail:
898
	kfree_skb(skb);
899
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
900 901
	return err;
}
902
EXPORT_SYMBOL(ip_do_fragment);
903

L
Linus Torvalds 已提交
904 905 906
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
907
	struct msghdr *msg = from;
L
Linus Torvalds 已提交
908

909
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
910
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
L
Linus Torvalds 已提交
911 912
			return -EFAULT;
	} else {
913
		__wsum csum = 0;
914
		if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
L
Linus Torvalds 已提交
915 916 917 918 919
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
E
Eric Dumazet 已提交
920
EXPORT_SYMBOL(ip_generic_getfrag);
L
Linus Torvalds 已提交
921

922
/*
 * Checksum @copy bytes of @page starting at @offset.  The page is mapped
 * with kmap() for the duration of the computation and unmapped again
 * before returning.
 */
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *mapped = kmap(page);
	__wsum sum = csum_partial(mapped + offset, copy, 0);

	kunmap(page);
	return sum;
}

933 934 935
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
936
			    struct inet_cork *cork,
937
			    struct page_frag *pfrag,
938 939 940 941
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
L
Linus Torvalds 已提交
942 943
{
	struct inet_sock *inet = inet_sk(sk);
W
Willem de Bruijn 已提交
944
	struct ubuf_info *uarg = NULL;
L
Linus Torvalds 已提交
945 946
	struct sk_buff *skb;

947
	struct ip_options *opt = cork->opt;
L
Linus Torvalds 已提交
948 949 950 951 952 953
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
954
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
L
Linus Torvalds 已提交
955
	int csummode = CHECKSUM_NONE;
956
	struct rtable *rt = (struct rtable *)cork->dst;
957
	unsigned int wmem_alloc_delta = 0;
958
	bool paged, extra_uref = false;
959
	u32 tskey = 0;
L
Linus Torvalds 已提交
960

961 962 963
	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
964
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
965
	paged = !!cork->gso_size;
966

967 968 969
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
L
Linus Torvalds 已提交
970

971
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
L
Linus Torvalds 已提交
972 973 974

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
975
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
976

977
	if (cork->length + length > maxnonfragsize - fragheaderlen) {
978
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
979
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
980 981 982 983 984 985 986 987 988
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
989
	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
990
	    (!(flags & MSG_MORE) || cork->gso_size) &&
991
	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
992
		csummode = CHECKSUM_PARTIAL;
L
Linus Torvalds 已提交
993

W
Willem de Bruijn 已提交
994 995 996 997
	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
998
		extra_uref = !skb;	/* only extra ref if !MSG_MORE */
W
Willem de Bruijn 已提交
999 1000 1001 1002 1003
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
1004
			skb_zcopy_set(skb, uarg, &extra_uref);
W
Willem de Bruijn 已提交
1005 1006 1007
		}
	}

1008
	cork->length += length;
L
Linus Torvalds 已提交
1009 1010 1011 1012 1013 1014 1015 1016

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

1017
	if (!skb)
L
Linus Torvalds 已提交
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
W
Willem de Bruijn 已提交
1031
			unsigned int pagedlen;
L
Linus Torvalds 已提交
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;
W
Willem de Bruijn 已提交
1048
			pagedlen = 0;
L
Linus Torvalds 已提交
1049

1050
			if ((flags & MSG_MORE) &&
1051
			    !(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
1052
				alloclen = mtu;
1053
			else if (!paged)
1054
				alloclen = fraglen;
1055 1056 1057 1058
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}
L
Linus Torvalds 已提交
1059

1060 1061
			alloclen += exthdrlen;

L
Linus Torvalds 已提交
1062 1063 1064 1065 1066
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
1067
			if (datalen == length + fraggap)
1068
				alloclen += rt->dst.trailer_len;
1069

L
Linus Torvalds 已提交
1070
			if (transhdrlen) {
1071
				skb = sock_alloc_send_skb(sk,
L
Linus Torvalds 已提交
1072 1073 1074 1075
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
1076
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
L
Linus Torvalds 已提交
1077
				    2 * sk->sk_sndbuf)
1078 1079
					skb = alloc_skb(alloclen + hh_len + 15,
							sk->sk_allocation);
1080
				if (unlikely(!skb))
L
Linus Torvalds 已提交
1081 1082
					err = -ENOBUFS;
			}
1083
			if (!skb)
L
Linus Torvalds 已提交
1084 1085 1086 1087 1088 1089 1090 1091
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
1092

L
Linus Torvalds 已提交
1093 1094 1095
			/*
			 *	Find where to start putting bytes.
			 */
1096
			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
1097
			skb_set_network_header(skb, exthdrlen);
1098 1099
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
1100
			data += fragheaderlen + exthdrlen;
L
Linus Torvalds 已提交
1101 1102 1103 1104 1105 1106 1107 1108

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
1109
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1110 1111
			}

1112
			copy = datalen - transhdrlen - fraggap - pagedlen;
L
Linus Torvalds 已提交
1113 1114 1115 1116 1117 1118 1119
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
1120
			length -= copy + transhdrlen;
L
Linus Torvalds 已提交
1121 1122 1123 1124
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

1125 1126 1127 1128 1129 1130 1131
			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

1132 1133 1134
			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

L
Linus Torvalds 已提交
1135 1136 1137
			/*
			 * Put the packet on the pending queue.
			 */
1138 1139 1140 1141 1142
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
1143
			__skb_queue_tail(queue, skb);
L
Linus Torvalds 已提交
1144 1145 1146 1147 1148 1149
			continue;
		}

		if (copy > length)
			copy = length;

1150 1151
		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
L
Linus Torvalds 已提交
1152 1153 1154
			unsigned int off;

			off = skb->len;
1155
			if (getfrag(from, skb_put(skb, copy),
L
Linus Torvalds 已提交
1156 1157 1158 1159 1160
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
W
Willem de Bruijn 已提交
1161
		} else if (!uarg || !uarg->zerocopy) {
L
Linus Torvalds 已提交
1162 1163
			int i = skb_shinfo(skb)->nr_frags;

1164 1165
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
L
Linus Torvalds 已提交
1166
				goto error;
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
L
Linus Torvalds 已提交
1178
			}
1179 1180 1181 1182 1183 1184 1185 1186
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
L
Linus Torvalds 已提交
1187 1188
			skb->len += copy;
			skb->data_len += copy;
1189
			skb->truesize += copy;
1190
			wmem_alloc_delta += copy;
W
Willem de Bruijn 已提交
1191 1192 1193 1194
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
L
Linus Torvalds 已提交
1195 1196 1197 1198 1199
		}
		offset += copy;
		length -= copy;
	}

1200 1201
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1202 1203
	return 0;

1204 1205
error_efault:
	err = -EFAULT;
L
Linus Torvalds 已提交
1206
error:
1207 1208
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
1209
	cork->length -= length;
P
Pavel Emelyanov 已提交
1210
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1211
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1212
	return err;
L
Linus Torvalds 已提交
1213 1214
}

1215 1216 1217
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
1218
	struct ip_options_rcu *opt;
1219 1220
	struct rtable *rt;

1221 1222 1223 1224
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;

1225 1226 1227 1228 1229
	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
1230
		if (!cork->opt) {
1231 1232
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
1233
			if (unlikely(!cork->opt))
1234 1235
				return -ENOBUFS;
		}
1236
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1237 1238 1239
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
1240

1241 1242 1243 1244
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
1245 1246
	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1247

1248
	cork->gso_size = ipc->gso_size;
1249 1250
	cork->dst = &rt->dst;
	cork->length = 0;
1251 1252 1253
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
1254
	cork->transmit_time = ipc->sockc.transmit_time;
1255 1256
	cork->tx_flags = 0;
	sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each pieces will be holded on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
1272
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
1286
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1287 1288 1289 1290 1291 1292
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

1293 1294
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
1295 1296 1297
				from, length, transhdrlen, flags);
}

1298
/*
 *	Append @size bytes of @page, starting at @offset, to the datagram
 *	pending on the socket write queue. The page is attached to the skb
 *	as a page fragment rather than copied.
 *
 *	A datagram must already be pending (the write queue must not be
 *	empty) and the output device must support scatter-gather.
 *
 *	Returns 0 on success or when MSG_PROBE is set, otherwise:
 *	  -EPERM	socket has hdrincl set
 *	  -EINVAL	nothing is pending on the write queue
 *	  -EOPNOTSUPP	device lacks NETIF_F_SG
 *	  -EMSGSIZE	datagram would become too large, or the page
 *			could not be appended to the skb
 *	  -ENOBUFS	a new fragment skb could not be allocated
 *	On the error path inside the loop the bytes that were not appended
 *	are subtracted from cork->length again.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;

	/* Fragment payload (everything but the header) is a multiple of 8. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + size > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
		return -EINVAL;

	cork->length += size;

	while (size > 0) {
		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;

		if (len <= 0) {
			/* Current skb is full: start a new fragment skb. */
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			/* Bytes the previous skb holds beyond maxfraglen. */
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the overhang from the previous skb and
				 * shift its checksum contribution along.
				 */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;

		if (skb_append_pagefrags(skb, page, offset, len)) {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			/* Software checksum: fold in the page data. */
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		refcount_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

1428
static void ip_cork_release(struct inet_cork *cork)
1429
{
1430 1431 1432 1433 1434
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
1435 1436
}

L
Linus Torvalds 已提交
1437 1438 1439 1440
/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
1441
struct sk_buff *__ip_make_skb(struct sock *sk,
1442
			      struct flowi4 *fl4,
1443 1444
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
L
Linus Torvalds 已提交
1445 1446 1447 1448
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
1449
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
1450
	struct ip_options *opt = NULL;
1451
	struct rtable *rt = (struct rtable *)cork->dst;
L
Linus Torvalds 已提交
1452
	struct iphdr *iph;
1453
	__be16 df = 0;
L
Linus Torvalds 已提交
1454 1455
	__u8 ttl;

1456 1457
	skb = __skb_dequeue(queue);
	if (!skb)
L
Linus Torvalds 已提交
1458 1459 1460 1461
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
1462
	if (skb->data < skb_network_header(skb))
1463
		__skb_pull(skb, skb_network_offset(skb));
1464
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1465
		__skb_pull(tmp_skb, skb_network_header_len(skb));
L
Linus Torvalds 已提交
1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
W
WANG Cong 已提交
1479
	skb->ignore_df = ip_sk_ignore_df(sk);
L
Linus Torvalds 已提交
1480 1481

	/* DF bit is set when we want to see DF on outgoing frames.
W
WANG Cong 已提交
1482
	 * If ignore_df is set too, we still allow to fragment this frame
L
Linus Torvalds 已提交
1483
	 * locally. */
1484 1485
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
1486 1487
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
L
Linus Torvalds 已提交
1488 1489
		df = htons(IP_DF);

1490 1491
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1492

1493 1494 1495
	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
L
Linus Torvalds 已提交
1496 1497
		ttl = inet->mc_ttl;
	else
1498
		ttl = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
1499

1500
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1501 1502
	iph->version = 4;
	iph->ihl = 5;
1503
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
L
Linus Torvalds 已提交
1504 1505 1506
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
1507
	ip_copy_addrs(iph, fl4);
1508
	ip_select_ident(net, skb, sk);
L
Linus Torvalds 已提交
1509

1510 1511 1512 1513 1514
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

1515
	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1516
	skb->mark = sk->sk_mark;
1517
	skb->tstamp = cork->transmit_time;
1518 1519 1520 1521
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
1522
	cork->dst = NULL;
1523
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
1524

1525
	if (iph->protocol == IPPROTO_ICMP)
1526
		icmp_out_count(net, ((struct icmphdr *)
1527 1528
			skb_transport_header(skb))->type);

1529 1530 1531 1532 1533
	ip_cork_release(cork);
out:
	return skb;
}

E
Eric Dumazet 已提交
1534
int ip_send_skb(struct net *net, struct sk_buff *skb)
1535 1536 1537
{
	int err;

1538
	err = ip_local_out(net, skb->sk, skb);
L
Linus Torvalds 已提交
1539 1540
	if (err) {
		if (err > 0)
E
Eric Dumazet 已提交
1541
			err = net_xmit_errno(err);
L
Linus Torvalds 已提交
1542
		if (err)
1543
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1544 1545 1546 1547 1548
	}

	return err;
}

1549
/*
 *	Build the datagram pending on the socket with ip_finish_skb() and
 *	send it.
 *
 *	Returns 0 if there was nothing to send, otherwise the result of
 *	ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(sock_net(sk), skb);
}

L
Linus Torvalds 已提交
1561 1562 1563
/*
 *	Throw away all pending data on the socket.
 */
1564 1565 1566
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
L
Linus Torvalds 已提交
1567 1568 1569
{
	struct sk_buff *skb;

1570
	while ((skb = __skb_dequeue_tail(queue)) != NULL)
L
Linus Torvalds 已提交
1571 1572
		kfree_skb(skb);

1573 1574 1575 1576 1577
	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
1578
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
L
Linus Torvalds 已提交
1579 1580
}

1581
struct sk_buff *ip_make_skb(struct sock *sk,
1582
			    struct flowi4 *fl4,
1583 1584 1585 1586
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
W
Willem de Bruijn 已提交
1587
			    struct inet_cork *cork, unsigned int flags)
1588 1589 1590 1591 1592 1593 1594 1595 1596
{
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

W
Willem de Bruijn 已提交
1597 1598 1599 1600
	cork->flags = 0;
	cork->addr = 0;
	cork->opt = NULL;
	err = ip_setup_cork(sk, cork, ipc, rtp);
1601 1602 1603
	if (err)
		return ERR_PTR(err);

W
Willem de Bruijn 已提交
1604
	err = __ip_append_data(sk, fl4, &queue, cork,
1605
			       &current->task_frag, getfrag,
1606 1607
			       from, length, transhdrlen, flags);
	if (err) {
W
Willem de Bruijn 已提交
1608
		__ip_flush_pending_frames(sk, &queue, cork);
1609 1610 1611
		return ERR_PTR(err);
	}

W
Willem de Bruijn 已提交
1612
	return __ip_make_skb(sk, fl4, &queue, cork);
1613
}
L
Linus Torvalds 已提交
1614 1615 1616 1617

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
1618
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
L
Linus Torvalds 已提交
1619 1620
			      int len, int odd, struct sk_buff *skb)
{
1621
	__wsum csum;
L
Linus Torvalds 已提交
1622 1623 1624

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
1625
	return 0;
L
Linus Torvalds 已提交
1626 1627
}

1628
/*
L
Linus Torvalds 已提交
1629
 *	Generic function to send a packet as reply to another packet.
1630
 *	Used to send some TCP resets/acks so far.
L
Linus Torvalds 已提交
1631
 */
1632
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1633 1634 1635
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
1636
			   unsigned int len)
L
Linus Torvalds 已提交
1637
{
1638
	struct ip_options_data replyopts;
L
Linus Torvalds 已提交
1639
	struct ipcm_cookie ipc;
1640
	struct flowi4 fl4;
E
Eric Dumazet 已提交
1641
	struct rtable *rt = skb_rtable(skb);
1642
	struct net *net = sock_net(sk);
1643
	struct sk_buff *nskb;
1644
	int err;
1645
	int oif;
L
Linus Torvalds 已提交
1646

1647
	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
L
Linus Torvalds 已提交
1648 1649
		return;

1650
	ipcm_init(&ipc);
1651
	ipc.addr = daddr;
L
Linus Torvalds 已提交
1652

1653
	if (replyopts.opt.opt.optlen) {
L
Linus Torvalds 已提交
1654 1655
		ipc.opt = &replyopts.opt;

1656 1657
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
L
Linus Torvalds 已提交
1658 1659
	}

1660
	oif = arg->bound_dev_if;
1661 1662
	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
		oif = skb->skb_iif;
1663 1664

	flowi4_init_output(&fl4, oif,
J
Jon Maxwell 已提交
1665
			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
1666
			   RT_TOS(arg->tos),
1667
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1668
			   ip_reply_arg_flowi_flags(arg),
1669
			   daddr, saddr,
1670 1671
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
			   arg->uid);
1672
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1673
	rt = ip_route_output_key(net, &fl4);
1674 1675
	if (IS_ERR(rt))
		return;
L
Linus Torvalds 已提交
1676

1677
	inet_sk(sk)->tos = arg->tos;
L
Linus Torvalds 已提交
1678 1679

	sk->sk_priority = skb->priority;
1680
	sk->sk_protocol = ip_hdr(skb)->protocol;
1681
	sk->sk_bound_dev_if = arg->bound_dev_if;
1682
	sk->sk_sndbuf = sysctl_wmem_default;
1683
	sk->sk_mark = fl4.flowi4_mark;
1684 1685 1686 1687 1688 1689 1690
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

1691 1692
	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
L
Linus Torvalds 已提交
1693
		if (arg->csumoffset >= 0)
1694 1695
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1696
								arg->csum));
1697
		nskb->ip_summed = CHECKSUM_NONE;
1698
		ip_push_pending_frames(sk, &fl4);
L
Linus Torvalds 已提交
1699
	}
1700
out:
L
Linus Torvalds 已提交
1701 1702 1703 1704 1705 1706 1707 1708
	ip_rt_put(rt);
}

/*
 *	Boot-time initialisation of the IP layer: routing, the inet peer
 *	storage and, when CONFIG_IP_MULTICAST is enabled, IGMP.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST)
	igmp_mc_init();
#endif
}