/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
A
Al Viro 已提交
52
#include <linux/highmem.h>
53
#include <linux/slab.h>
L
Linus Torvalds 已提交
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
69
#include <net/xfrm.h>
L
Linus Torvalds 已提交
70 71 72 73 74 75 76 77 78 79 80
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
81
#include <linux/tcp.h>
L
Linus Torvalds 已提交
82

83
/* Default IP time-to-live, initialised to IPDEFTTL and exported to modules. */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
L
Linus Torvalds 已提交
85

86
/*
 * Forward declaration: ip_finish_output_gso() and ip_finish_output() call
 * ip_fragment() before its definition appears in this file.
 */
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct sock *, struct sk_buff *));

L
Linus Torvalds 已提交
90
/* Generate a checksum for an outgoing IP datagram. */
91
void ip_send_check(struct iphdr *iph)
L
Linus Torvalds 已提交
92 93 94 95
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
E
Eric Dumazet 已提交
96
EXPORT_SYMBOL(ip_send_check);
L
Linus Torvalds 已提交
97

98
static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
H
Herbert Xu 已提交
99 100 101 102 103
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
104
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
105
		       skb_dst(skb)->dev, dst_output);
106 107 108 109 110
}

/*
 * Convenience wrapper around __ip_local_out_sk() that uses the socket
 * attached to the skb (skb->sk).
 */
int __ip_local_out(struct sk_buff *skb)
{
	return __ip_local_out_sk(skb->sk, skb);
}

113
/*
 * Send a locally generated packet on behalf of @sk.
 *
 * The LOCAL_OUT hook is run with the caller-supplied @sk rather than
 * skb->sk, so that the hook's continuation and the direct dst_output()
 * call below operate on the same socket even when sk != skb->sk.
 *
 * When the hook returns 1 the packet is handed to dst_output() here and
 * that call's result is returned; any other hook return value is passed
 * back to the caller unchanged.
 */
int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out_sk(sk, skb);
	if (likely(err == 1))
		err = dst_output(sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out_sk);
H
Herbert Xu 已提交
124

L
Linus Torvalds 已提交
125 126 127 128 129
/*
 * Pick the TTL for an outgoing packet: the socket's uc_ttl when it is
 * non-negative, otherwise the hop limit reported by ip4_dst_hoplimit()
 * for the route.
 */
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

134
/*
 *		Add an ip header to a skbuff and send it out.
 *
 *	@skb must already carry a route (it is read with skb_rtable()).
 *	@saddr/@daddr are written into the header; when @opt requests source
 *	routing (opt->opt.srr) the header's destination is opt->opt.faddr
 *	instead of @daddr.  Returns the result of ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(sock_net(sk), skb, sk);

	/* Account for the options in ihl (32-bit words) and write them out. */
	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

175
static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
176
{
E
Eric Dumazet 已提交
177
	struct dst_entry *dst = skb_dst(skb);
178
	struct rtable *rt = (struct rtable *)dst;
L
Linus Torvalds 已提交
179
	struct net_device *dev = dst->dev;
180
	struct net *net = dev_net(dev);
181
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
182
	struct neighbour *neigh;
183
	u32 nexthop;
L
Linus Torvalds 已提交
184

185
	if (rt->rt_type == RTN_MULTICAST) {
186
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
187
	} else if (rt->rt_type == RTN_BROADCAST)
188
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
189

L
Linus Torvalds 已提交
190
	/* Be paranoid, rather than too clever. */
191
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
L
Linus Torvalds 已提交
192 193 194
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
195
		if (!skb2) {
L
Linus Torvalds 已提交
196 197 198 199 200
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
201
		consume_skb(skb);
L
Linus Torvalds 已提交
202 203 204
		skb = skb2;
	}

205
	rcu_read_lock_bh();
J
Julian Anastasov 已提交
206
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
207 208 209
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
210
	if (!IS_ERR(neigh)) {
211
		int res = dst_neigh_output(dst, neigh, skb);
212

213
		rcu_read_unlock_bh();
214 215
		return res;
	}
216
	rcu_read_unlock_bh();
217

218 219
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
L
Linus Torvalds 已提交
220 221 222 223
	kfree_skb(skb);
	return -EINVAL;
}

224 225
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
				unsigned int mtu)
226 227 228 229 230 231 232
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

	/* common case: locally created skb or seglen is <= mtu */
	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
233
	      skb_gso_network_seglen(skb) <= mtu)
234
		return ip_finish_output2(sk, skb);
235 236 237 238 239 240 241 242 243 244

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
245
	if (IS_ERR_OR_NULL(segs)) {
246 247 248 249 250 251 252 253 254 255 256
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

		segs->next = NULL;
257
		err = ip_fragment(sk, segs, mtu, ip_finish_output2);
258 259 260 261 262 263 264 265 266

		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

267
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
268
{
269 270
	unsigned int mtu;

271 272
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
273
	if (skb_dst(skb)->xfrm) {
274
		IPCB(skb)->flags |= IPSKB_REROUTED;
275
		return dst_output(sk, skb);
276
	}
277
#endif
278
	mtu = ip_skb_dst_mtu(skb);
279
	if (skb_is_gso(skb))
280
		return ip_finish_output_gso(sk, skb, mtu);
281

282
	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
283
		return ip_fragment(sk, skb, mtu, ip_finish_output2);
284

285
	return ip_finish_output2(sk, skb);
L
Linus Torvalds 已提交
286 287
}

288
int ip_mc_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
289
{
E
Eric Dumazet 已提交
290
	struct rtable *rt = skb_rtable(skb);
291
	struct net_device *dev = rt->dst.dev;
292
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
293 294 295 296

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
297
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
298 299 300 301 302 303 304 305 306

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
307
		if (sk_mc_loop(sk)
L
Linus Torvalds 已提交
308 309 310 311 312 313 314 315 316
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be  dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
317 318 319
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
L
Linus Torvalds 已提交
320
#endif
321
		   ) {
L
Linus Torvalds 已提交
322 323
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
324
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
325
					sk, newskb, NULL, newskb->dev,
326
					dev_loopback_xmit);
L
Linus Torvalds 已提交
327 328 329 330
		}

		/* Multicasts with ttl 0 must not go beyond the host */

331
		if (ip_hdr(skb)->ttl == 0) {
L
Linus Torvalds 已提交
332 333 334 335 336 337 338 339
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
340
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
341
				NULL, newskb->dev, dev_loopback_xmit);
L
Linus Torvalds 已提交
342 343
	}

344
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
345
			    skb->dev, ip_finish_output,
346
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
347 348
}

349
int ip_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
350
{
E
Eric Dumazet 已提交
351
	struct net_device *dev = skb_dst(skb)->dev;
352
	struct net *net = dev_net(dev);
353

354
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
355

356 357 358
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

359 360
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
			    NULL, dev,
361
			    ip_finish_output,
362
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
363 364
}

365 366 367 368 369 370 371 372 373 374 375 376 377 378
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}

379 380
/* Note: skb->sk can be different from sk, in case of tunnels */
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
L
Linus Torvalds 已提交
381 382
{
	struct inet_sock *inet = inet_sk(sk);
383
	struct ip_options_rcu *inet_opt;
384
	struct flowi4 *fl4;
L
Linus Torvalds 已提交
385 386
	struct rtable *rt;
	struct iphdr *iph;
387
	int res;
L
Linus Torvalds 已提交
388 389 390 391

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
392
	rcu_read_lock();
393
	inet_opt = rcu_dereference(inet->inet_opt);
394
	fl4 = &fl->u.ip4;
E
Eric Dumazet 已提交
395
	rt = skb_rtable(skb);
396
	if (rt)
L
Linus Torvalds 已提交
397 398 399 400
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
401
	if (!rt) {
A
Al Viro 已提交
402
		__be32 daddr;
L
Linus Torvalds 已提交
403 404

		/* Use correct destination address if we have options. */
E
Eric Dumazet 已提交
405
		daddr = inet->inet_daddr;
406 407
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
L
Linus Torvalds 已提交
408

409 410 411 412
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
413
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
414 415 416 417 418 419 420 421
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
422
		sk_setup_caps(sk, &rt->dst);
L
Linus Torvalds 已提交
423
	}
424
	skb_dst_set_noref(skb, &rt->dst);
L
Linus Torvalds 已提交
425 426

packet_routed:
J
Julian Anastasov 已提交
427
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
L
Linus Torvalds 已提交
428 429 430
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
431
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
432
	skb_reset_network_header(skb);
433
	iph = ip_hdr(skb);
434
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
W
WANG Cong 已提交
435
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
L
Linus Torvalds 已提交
436 437 438
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
439
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
440
	iph->protocol = sk->sk_protocol;
441 442
	ip_copy_addrs(iph, fl4);

L
Linus Torvalds 已提交
443 444
	/* Transport layer set skb->h.foo itself. */

445 446 447
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
L
Linus Torvalds 已提交
448 449
	}

450 451
	ip_select_ident_segs(sock_net(sk), skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);
L
Linus Torvalds 已提交
452

453
	/* TODO : should we use skb->sk here instead of sk ? */
L
Linus Torvalds 已提交
454
	skb->priority = sk->sk_priority;
455
	skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
456

457 458 459
	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;
L
Linus Torvalds 已提交
460 461

no_route:
462
	rcu_read_unlock();
P
Pavel Emelyanov 已提交
463
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
464 465 466
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
E
Eric Dumazet 已提交
467
EXPORT_SYMBOL(ip_queue_xmit);
L
Linus Torvalds 已提交
468 469 470 471 472 473

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
E
Eric Dumazet 已提交
474
	skb_dst_drop(to);
475
	skb_dst_copy(to, from);
L
Linus Torvalds 已提交
476
	to->dev = from->dev;
T
Thomas Graf 已提交
477
	to->mark = from->mark;
L
Linus Torvalds 已提交
478 479 480 481 482 483 484

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
485
	nf_copy(to, from);
486 487
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
L
Linus Torvalds 已提交
488
#endif
489
	skb_copy_secmark(to, from);
L
Linus Torvalds 已提交
490 491
}

492
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
493
		       unsigned int mtu,
494 495 496 497
		       int (*output)(struct sock *, struct sk_buff *))
{
	struct iphdr *iph = ip_hdr(skb);

498 499 500 501
	if ((iph->frag_off & htons(IP_DF)) == 0)
		return ip_do_fragment(sk, skb, output);

	if (unlikely(!skb->ignore_df ||
502 503
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
504
		struct net *net = dev_net(skb_rtable(skb)->dst.dev);
505

506
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
507 508 509 510 511 512 513 514 515
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_do_fragment(sk, skb, output);
}

L
Linus Torvalds 已提交
516 517 518 519 520 521 522
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

523 524
int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct sock *, struct sk_buff *))
L
Linus Torvalds 已提交
525 526 527 528 529
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
530
	unsigned int mtu, hlen, left, len, ll_rs;
L
Linus Torvalds 已提交
531
	int offset;
532
	__be16 not_last_frag;
E
Eric Dumazet 已提交
533
	struct rtable *rt = skb_rtable(skb);
534
	struct net *net;
L
Linus Torvalds 已提交
535 536
	int err = 0;

537
	dev = rt->dst.dev;
538
	net = dev_net(dev);
L
Linus Torvalds 已提交
539 540 541 542 543

	/*
	 *	Point into the IP datagram header.
	 */

544
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
545

546
	mtu = ip_skb_dst_mtu(skb);
547 548
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;
L
Linus Torvalds 已提交
549 550 551 552 553 554

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
555
	mtu = mtu - hlen;	/* Size of data space */
H
Herbert Xu 已提交
556
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
L
Linus Torvalds 已提交
557 558 559 560 561 562 563 564

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
565
	if (skb_has_frag_list(skb)) {
566
		struct sk_buff *frag, *frag2;
L
Linus Torvalds 已提交
567 568 569 570
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
571
		    ip_is_fragment(iph) ||
L
Linus Torvalds 已提交
572 573 574
		    skb_cloned(skb))
			goto slow_path;

575
		skb_walk_frags(skb, frag) {
L
Linus Torvalds 已提交
576 577 578 579
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
580
				goto slow_path_clean;
L
Linus Torvalds 已提交
581 582 583

			/* Partially cloned skb? */
			if (skb_shared(frag))
584
				goto slow_path_clean;
585 586 587 588 589 590

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
591
			skb->truesize -= frag->truesize;
L
Linus Torvalds 已提交
592 593 594 595 596 597 598
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
599
		skb_frag_list_init(skb);
L
Linus Torvalds 已提交
600 601 602 603 604 605 606 607 608 609 610
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
611
				skb_reset_transport_header(frag);
612 613
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
614
				memcpy(skb_network_header(frag), iph, hlen);
615
				iph = ip_hdr(frag);
L
Linus Torvalds 已提交
616 617 618 619 620 621
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
622
				if (frag->next)
L
Linus Torvalds 已提交
623 624 625 626 627
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

628
			err = output(sk, skb);
L
Linus Torvalds 已提交
629

630
			if (!err)
631
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
632 633 634 635 636 637 638 639 640
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
641
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
642 643 644 645 646 647 648 649
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
650
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
651
		return err;
652 653 654 655 656 657 658 659 660

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
L
Linus Torvalds 已提交
661 662 663
	}

slow_path:
664 665 666
	/* for offloaded checksums cleanup checksum before fragmentation */
	if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
		goto fail;
667
	iph = ip_hdr(skb);
668

L
Linus Torvalds 已提交
669
	left = skb->len - hlen;		/* Space per frame */
670
	ptr = hlen;		/* Where to start from */
L
Linus Torvalds 已提交
671

672
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
673

L
Linus Torvalds 已提交
674 675 676 677 678 679 680 681 682 683 684
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

S
Stephen Hemminger 已提交
685
	while (left > 0) {
L
Linus Torvalds 已提交
686 687 688 689
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
L
Lucas De Marchi 已提交
690
		/* IF: we are not sending up to and including the packet end
L
Linus Torvalds 已提交
691 692 693 694 695
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

696 697 698
		/* Allocate buffer */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (!skb2) {
L
Linus Torvalds 已提交
699 700 701 702 703 704 705 706 707 708 709
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
710
		skb_reset_network_header(skb2);
711
		skb2->transport_header = skb2->network_header + hlen;
L
Linus Torvalds 已提交
712 713 714 715 716 717 718 719 720 721 722 723 724

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

725
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
L
Linus Torvalds 已提交
726 727 728 729

		/*
		 *	Copy a block of the IP datagram.
		 */
730
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
L
Linus Torvalds 已提交
731 732 733 734 735 736
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
737
		iph = ip_hdr(skb2);
L
Linus Torvalds 已提交
738 739
		iph->frag_off = htons((offset >> 3));

740 741 742
		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
			iph->frag_off |= htons(IP_DF);

L
Linus Torvalds 已提交
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767
		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

768
		err = output(sk, skb2);
L
Linus Torvalds 已提交
769 770
		if (err)
			goto fail;
771

772
		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
773
	}
774
	consume_skb(skb);
775
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
776 777 778
	return err;

fail:
779
	kfree_skb(skb);
780
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
781 782
	return err;
}
783
EXPORT_SYMBOL(ip_do_fragment);
784

L
Linus Torvalds 已提交
785 786 787
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
788
	struct msghdr *msg = from;
L
Linus Torvalds 已提交
789

790
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
791
		if (copy_from_iter(to, len, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
792 793
			return -EFAULT;
	} else {
794
		__wsum csum = 0;
795
		if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
796 797 798 799 800
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
E
Eric Dumazet 已提交
801
EXPORT_SYMBOL(ip_generic_getfrag);
L
Linus Torvalds 已提交
802

803
/*
 * Checksum @copy bytes of @page starting at byte @offset.  The page is
 * mapped with kmap() for the duration of the computation and unmapped
 * again before returning.
 */
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

A
Adrian Bunk 已提交
814
static inline int ip_ufo_append_data(struct sock *sk,
815
			struct sk_buff_head *queue,
816 817 818
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
819
			int transhdrlen, int maxfraglen, unsigned int flags)
820 821 822 823 824 825 826 827
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
828 829
	skb = skb_peek_tail(queue);
	if (!skb) {
830 831 832 833
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

834
		if (!skb)
835 836 837 838 839 840
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
841
		skb_put(skb, fragheaderlen + transhdrlen);
842 843

		/* initialize network header pointer */
844
		skb_reset_network_header(skb);
845 846

		/* initialize protocol header pointer */
847
		skb->transport_header = skb->network_header + fragheaderlen;
848 849 850

		skb->csum = 0;

851
		__skb_queue_tail(queue, skb);
852 853
	} else if (skb_is_gso(skb)) {
		goto append;
854
	}
855

856 857 858 859 860 861
	skb->ip_summed = CHECKSUM_PARTIAL;
	/* specify the length of each IP datagram fragment */
	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;

append:
862 863
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
864 865
}

866 867 868
/*
 *	Append @length bytes, fetched through @getfrag from @from, to the
 *	chain of pending skbs on @queue, allocating additional skbs when the
 *	tail skb cannot take more data.
 *
 *	@sk:		socket the data is charged to
 *	@fl4:		flow; its daddr is used when reporting EMSGSIZE
 *	@queue:		list of pending skbs that is extended
 *	@cork:		cork state (route, options, fragsize, running length)
 *	@pfrag:		page fragment used for the scatter-gather copy path
 *	@getfrag:	callback that copies the caller's data; a negative
 *			return is turned into -EFAULT
 *	@transhdrlen:	transport header length; non-zero only for the first
 *			skb, reset to 0 once that skb has been built
 *	@flags:		MSG_* flags (MSG_MORE and MSG_DONTWAIT are examined)
 *
 *	Returns 0 on success or a negative errno. On the error path
 *	cork->length is reduced by the bytes not yet appended and
 *	IPSTATS_MIB_OUTDISCARDS is incremented.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
869
			    struct inet_cork *cork,
870
			    struct page_frag *pfrag,
871 872 873 874
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
L
Linus Torvalds 已提交
875 876 877 878
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

879
	struct ip_options *opt = cork->opt;
L
Linus Torvalds 已提交
880 881 882 883 884 885
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
886
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
L
Linus Torvalds 已提交
887
	int csummode = CHECKSUM_NONE;
888
	struct rtable *rt = (struct rtable *)cork->dst;
889
	u32 tskey = 0;
L
Linus Torvalds 已提交
890

891 892 893
	skb = skb_peek_tail(queue);

	/* rt->dst.header_len is only accounted for while the queue is empty */
	exthdrlen = !skb ? rt->dst.header_len : 0;
894
	mtu = cork->fragsize;
895 896 897
	/* take a timestamp key only if SW timestamping with OPT_ID is on */
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
L
Linus Torvalds 已提交
898

899
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
L
Linus Torvalds 已提交
900 901 902

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* header plus the payload rounded down to a multiple of 8 bytes */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
903
	/* limit for the whole datagram: 0xFFFF if DF is ignored, else mtu */
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
904

905
	if (cork->length + length > maxnonfragsize - fragheaderlen) {
906
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
907
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
908 909 910 911 912 913 914 915 916
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
917
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
L
Linus Torvalds 已提交
918
	    !exthdrlen)
919
		csummode = CHECKSUM_PARTIAL;
L
Linus Torvalds 已提交
920

921
	cork->length += length;
922
	/*
	 * Hand over to ip_ufo_append_data() for UDP datagram sockets on a
	 * device advertising NETIF_F_UFO (and a route without header_len),
	 * when the data exceeds the mtu or the tail skb is already GSO.
	 */
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
923
	    (sk->sk_protocol == IPPROTO_UDP) &&
924 925
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM)) {
926 927
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
928
					 maxfraglen, flags);
929
		if (err)
930 931 932
			goto error;
		return 0;
	}
L
Linus Torvalds 已提交
933 934 935 936 937 938 939 940

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

941
	/* empty queue: enter the loop body directly at the allocation */
	if (!skb)
L
Linus Torvalds 已提交
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/*
			 * fraggap: bytes of the previous skb that lie beyond
			 * maxfraglen; they are moved into the new skb below.
			 */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

972
			if ((flags & MSG_MORE) &&
973
			    !(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
974 975
				alloclen = mtu;
			else
976
				alloclen = fraglen;
L
Linus Torvalds 已提交
977

978 979
			alloclen += exthdrlen;

L
Linus Torvalds 已提交
980 981 982 983 984
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
985
			if (datalen == length + fraggap)
986
				alloclen += rt->dst.trailer_len;
987

L
Linus Torvalds 已提交
988
			/*
			 * The skb carrying the transport header is allocated
			 * with sock_alloc_send_skb() (honouring MSG_DONTWAIT);
			 * later skbs use sock_wmalloc(), and only while
			 * sk_wmem_alloc does not exceed twice sk_sndbuf.
			 */
			if (transhdrlen) {
989
				skb = sock_alloc_send_skb(sk,
L
Linus Torvalds 已提交
990 991 992 993 994 995
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
996
					skb = sock_wmalloc(sk,
L
Linus Torvalds 已提交
997 998
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
999
				if (unlikely(!skb))
L
Linus Torvalds 已提交
1000 1001
					err = -ENOBUFS;
			}
1002
			if (!skb)
L
Linus Torvalds 已提交
1003 1004 1005 1006 1007 1008 1009 1010
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
1011 1012

			/* only the initial fragment is time stamped */
1013
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1014
			cork->tx_flags = 0;
1015 1016
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
L
Linus Torvalds 已提交
1017 1018 1019 1020

			/*
			 *	Find where to start putting bytes.
			 */
1021
			data = skb_put(skb, fraglen + exthdrlen);
1022
			skb_set_network_header(skb, exthdrlen);
1023 1024
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
1025
			data += fragheaderlen + exthdrlen;
L
Linus Torvalds 已提交
1026 1027 1028 1029 1030 1031 1032 1033

			/*
			 * Move the fraggap bytes from the previous skb into
			 * this one, transfer their checksum, and trim the
			 * previous skb back to maxfraglen.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
1034
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* header lengths and csummode apply to the first skb only */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
1053
			__skb_queue_tail(queue, skb);
L
Linus Torvalds 已提交
1054 1055 1056 1057 1058 1059
			continue;
		}

		if (copy > length)
			copy = length;

1060
		if (!(rt->dst.dev->features&NETIF_F_SG)) {
L
Linus Torvalds 已提交
1061 1062 1063
			/* no scatter-gather: extend the skb with skb_put() */
			unsigned int off;

			off = skb->len;
1064
			if (getfrag(from, skb_put(skb, copy),
L
Linus Torvalds 已提交
1065 1066 1067 1068 1069 1070 1071 1072
					offset, copy, off, skb) < 0) {
				/* undo the skb_put() on a failed copy */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* scatter-gather: copy into pfrag, attach as page frag */
			int i = skb_shinfo(skb)->nr_frags;

1073 1074
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
L
Linus Torvalds 已提交
1075
				goto error;
1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086

			/* open a new frag slot unless the last one can be extended */
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
L
Linus Torvalds 已提交
1087
			}
1088 1089 1090 1091 1092 1093 1094 1095
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
L
Linus Torvalds 已提交
1096 1097
			/* account the added bytes on the skb and the socket by hand */
			skb->len += copy;
			skb->data_len += copy;
1098 1099
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1100 1101 1102 1103 1104 1105 1106
		}
		offset += copy;
		length -= copy;
	}

	return 0;

1107 1108
error_efault:
	err = -EFAULT;
L
Linus Torvalds 已提交
1109
error:
1110
	/* length now holds only the bytes that were not appended */
	cork->length -= length;
P
Pavel Emelyanov 已提交
1111
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1112
	return err;
L
Linus Torvalds 已提交
1113 1114
}

1115 1116 1117
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
1118
	struct ip_options_rcu *opt;
1119 1120 1121 1122 1123 1124 1125
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
1126
		if (!cork->opt) {
1127 1128
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
1129
			if (unlikely(!cork->opt))
1130 1131
				return -ENOBUFS;
		}
1132
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1133 1134 1135 1136 1137 1138 1139 1140 1141 1142
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
1143 1144
	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1145 1146
	cork->dst = &rt->dst;
	cork->length = 0;
1147 1148 1149
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165
	cork->tx_flags = ipc->tx_flags;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece is held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
1166
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
1180
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1181 1182 1183 1184 1185 1186
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

1187 1188
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
1189 1190 1191
				from, length, transhdrlen, flags);
}

1192
/*
 *	Append @size bytes of @page, starting at @offset, to the datagram
 *	that is already pending on sk->sk_write_queue.
 *
 *	Returns 0 on success (and for MSG_PROBE), or a negative errno:
 *	-EPERM for hdrincl sockets, -EINVAL when nothing is pending,
 *	-EOPNOTSUPP when the device lacks NETIF_F_SG, -EMSGSIZE when the
 *	datagram would become too large or the page cannot be attached,
 *	-ENOBUFS when a new skb cannot be allocated. On the error path
 *	cork->length is reduced by the bytes not yet appended and
 *	IPSTATS_MIB_OUTDISCARDS is incremented.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
L
Linus Torvalds 已提交
1193 1194 1195 1196 1197 1198
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
1199
	struct inet_cork *cork;
L
Linus Torvalds 已提交
1200 1201 1202 1203
	int hh_len;
	int mtu;
	int len;
	int err;
1204
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
L
Linus Torvalds 已提交
1205 1206 1207 1208 1209 1210 1211 1212 1213 1214

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* pages can only be added to a datagram that was already started */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

1215 1216 1217 1218
	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1219

1220
	if (!(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
1221 1222
		return -EOPNOTSUPP;

1223
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1224
	mtu = cork->fragsize;
L
Linus Torvalds 已提交
1225 1226 1227

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* header plus the payload rounded down to a multiple of 8 bytes */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
1228
	/* limit for the whole datagram: 0xFFFF if DF is ignored, else mtu */
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
1229

1230
	if (cork->length + size > maxnonfragsize - fragheaderlen) {
1231 1232
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
1233 1234 1235
		return -EMSGSIZE;
	}

1236 1237
	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
L
Linus Torvalds 已提交
1238 1239
		return -EINVAL;

1240
	cork->length += size;
1241 1242
	/*
	 * Data no longer fits in one mtu on a UDP socket whose device
	 * advertises NETIF_F_UFO: mark the tail skb as GSO.
	 */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
1243
	    (rt->dst.dev->features & NETIF_F_UFO)) {
1244
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
H
Herbert Xu 已提交
1245
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1246
	}
1247

L
Linus Torvalds 已提交
1248
	while (size > 0) {
1249
		/* a GSO skb takes all of the remaining data */
		if (skb_is_gso(skb)) {
1250
			len = size;
1251
		} else {
1252 1253 1254 1255 1256 1257

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
L
Linus Torvalds 已提交
1258 1259 1260 1261 1262
		/*
		 * Tail skb is full: queue a new skb holding only the header
		 * room plus the fraggap bytes moved over from the previous
		 * skb, then retry.
		 */
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
1263
			fraggap = skb_prev->len - maxfraglen;
L
Linus Torvalds 已提交
1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
1282
			skb_put(skb, fragheaderlen + fraggap);
1283
			skb_reset_network_header(skb);
1284 1285
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
L
Linus Torvalds 已提交
1286
			/*
			 * Move the fraggap bytes out of the previous skb,
			 * transfer their checksum, and trim the previous skb
			 * back to maxfraglen.
			 */
			if (fraggap) {
1287 1288
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
1289
						    skb_transport_header(skb),
1290
								   fraggap, 0);
L
Linus Torvalds 已提交
1291 1292
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
1293
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;
1305 1306

		if (skb_append_pagefrags(skb, page, offset, len)) {
L
Linus Torvalds 已提交
1307 1308 1309 1310 1311
			err = -EMSGSIZE;
			goto error;
		}

		/* software checksum: fold in the bytes just attached */
		if (skb->ip_summed == CHECKSUM_NONE) {
1312
			__wsum csum;
L
Linus Torvalds 已提交
1313 1314 1315 1316 1317 1318
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* account the added bytes on the skb and the socket by hand */
		skb->len += len;
		skb->data_len += len;
1319 1320
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1321 1322 1323 1324 1325 1326
		offset += len;
		size -= len;
	}
	return 0;

error:
1327
	/* size now holds only the bytes that were not appended */
	cork->length -= size;
P
Pavel Emelyanov 已提交
1328
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1329 1330 1331
	return err;
}

1332
static void ip_cork_release(struct inet_cork *cork)
1333
{
1334 1335 1336 1337 1338
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
1339 1340
}

L
Linus Torvalds 已提交
1341 1342 1343 1344
/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
1345
struct sk_buff *__ip_make_skb(struct sock *sk,
1346
			      struct flowi4 *fl4,
1347 1348
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
L
Linus Torvalds 已提交
1349 1350 1351 1352
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
1353
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
1354
	struct ip_options *opt = NULL;
1355
	struct rtable *rt = (struct rtable *)cork->dst;
L
Linus Torvalds 已提交
1356
	struct iphdr *iph;
1357
	__be16 df = 0;
L
Linus Torvalds 已提交
1358 1359
	__u8 ttl;

1360 1361
	skb = __skb_dequeue(queue);
	if (!skb)
L
Linus Torvalds 已提交
1362 1363 1364 1365
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
1366
	if (skb->data < skb_network_header(skb))
1367
		__skb_pull(skb, skb_network_offset(skb));
1368
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1369
		__skb_pull(tmp_skb, skb_network_header_len(skb));
L
Linus Torvalds 已提交
1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
W
WANG Cong 已提交
1383
	skb->ignore_df = ip_sk_ignore_df(sk);
L
Linus Torvalds 已提交
1384 1385

	/* DF bit is set when we want to see DF on outgoing frames.
W
WANG Cong 已提交
1386
	 * If ignore_df is set too, we still allow to fragment this frame
L
Linus Torvalds 已提交
1387
	 * locally. */
1388 1389
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
1390 1391
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
L
Linus Torvalds 已提交
1392 1393
		df = htons(IP_DF);

1394 1395
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1396

1397 1398 1399
	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
L
Linus Torvalds 已提交
1400 1401
		ttl = inet->mc_ttl;
	else
1402
		ttl = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
1403

1404
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1405 1406
	iph->version = 4;
	iph->ihl = 5;
1407
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
L
Linus Torvalds 已提交
1408 1409 1410
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
1411
	ip_copy_addrs(iph, fl4);
1412
	ip_select_ident(net, skb, sk);
L
Linus Torvalds 已提交
1413

1414 1415 1416 1417 1418
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

1419
	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1420
	skb->mark = sk->sk_mark;
1421 1422 1423 1424
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
1425
	cork->dst = NULL;
1426
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
1427

1428
	if (iph->protocol == IPPROTO_ICMP)
1429
		icmp_out_count(net, ((struct icmphdr *)
1430 1431
			skb_transport_header(skb))->type);

1432 1433 1434 1435 1436
	ip_cork_release(cork);
out:
	return skb;
}

E
Eric Dumazet 已提交
1437
int ip_send_skb(struct net *net, struct sk_buff *skb)
1438 1439 1440
{
	int err;

H
Herbert Xu 已提交
1441
	err = ip_local_out(skb);
L
Linus Torvalds 已提交
1442 1443
	if (err) {
		if (err > 0)
E
Eric Dumazet 已提交
1444
			err = net_xmit_errno(err);
L
Linus Torvalds 已提交
1445
		if (err)
1446
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1447 1448 1449 1450 1451
	}

	return err;
}

1452
/*
 *	Build the skb from the socket's pending data with ip_finish_skb()
 *	and send it. Returns 0 when there was nothing to send, otherwise
 *	the ip_send_skb() result.
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets whole the not fragmented skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}

L
Linus Torvalds 已提交
1464 1465 1466
/*
 *	Throw away all pending data on the socket.
 */
1467 1468 1469
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
L
Linus Torvalds 已提交
1470 1471 1472
{
	struct sk_buff *skb;

1473
	while ((skb = __skb_dequeue_tail(queue)) != NULL)
L
Linus Torvalds 已提交
1474 1475
		kfree_skb(skb);

1476 1477 1478 1479 1480
	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
1481
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
L
Linus Torvalds 已提交
1482 1483
}

1484
struct sk_buff *ip_make_skb(struct sock *sk,
1485
			    struct flowi4 *fl4,
1486 1487 1488 1489 1490 1491
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
1492
	struct inet_cork cork;
1493 1494 1495 1496 1497 1498 1499 1500
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

1501 1502
	cork.flags = 0;
	cork.addr = 0;
1503
	cork.opt = NULL;
1504 1505 1506 1507
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

1508 1509
	err = __ip_append_data(sk, fl4, &queue, &cork,
			       &current->task_frag, getfrag,
1510 1511 1512 1513 1514 1515
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

1516
	return __ip_make_skb(sk, fl4, &queue, &cork);
1517
}
L
Linus Torvalds 已提交
1518 1519 1520 1521

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
1522
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
L
Linus Torvalds 已提交
1523 1524
			      int len, int odd, struct sk_buff *skb)
{
1525
	__wsum csum;
L
Linus Torvalds 已提交
1526 1527 1528

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
1529
	return 0;
L
Linus Torvalds 已提交
1530 1531
}

1532
/*
L
Linus Torvalds 已提交
1533
 *	Generic function to send a packet as reply to another packet.
1534
 *	Used to send some TCP resets/acks so far.
L
Linus Torvalds 已提交
1535
 */
1536
/*
 *	Send the data described by @arg as a reply to @skb, using @sk as
 *	the sending socket.
 *
 *	@sk:	socket used for sending; its tos, priority, protocol,
 *		bound device and sndbuf are overwritten here
 *	@skb:	packet being replied to
 *	@sopt:	IP options of the received packet, echoed into the reply
 *	@daddr:	reply destination (replaced by the echoed options' faddr
 *		when they carry a source route)
 *	@saddr:	reply source address
 *	@arg:	reply payload (iov), checksum fixup, tos and device binding
 *	@len:	number of payload bytes to send
 *
 *	Nothing is returned; on any failure the reply is silently dropped.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);
	struct net *net = sock_net(sk);
	struct sk_buff *nskb;
	int err;
	int oif;

	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;
	ipc.ttl = 0;
	ipc.tos = -1;
	/*
	 * ip_setup_cork() copies ipc->priority into the cork, so give it a
	 * defined value instead of whatever is on the stack. With tos == -1
	 * the cork priority is not used by __ip_make_skb().
	 */
	ipc.priority = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* with a source route, route towards its first hop */
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	oif = arg->bound_dev_if;
	if (!oif && netif_index_is_vrf(net, skb->skb_iif))
		oif = skb->skb_iif;

	flowi4_init_output(&fl4, oif,
			   IP4_REPLY_MARK(net, skb->mark),
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	inet_sk(sk)->tos = arg->tos;

	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	sk->sk_sndbuf = sysctl_wmem_default;
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
		/* patch the transport checksum at the offset the caller gave */
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
								arg->csum));
		nskb->ip_summed = CHECKSUM_NONE;
		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
		ip_push_pending_frames(sk, &fl4);
	}
out:
	ip_rt_put(rt);
}

/*
 *	Boot-time initialisation: routing first, then the inet peer
 *	storage, then IGMP when multicast support is configured.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
	igmp_mc_init();
#endif
}