/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
A
Al Viro 已提交
52
#include <linux/highmem.h>
53
#include <linux/slab.h>
L
Linus Torvalds 已提交
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
69
#include <net/xfrm.h>
L
Linus Torvalds 已提交
70 71 72 73 74 75 76 77 78 79 80
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
81
#include <linux/tcp.h>
L
Linus Torvalds 已提交
82

83
/* Default IPv4 TTL value; starts out as IPDEFTTL and is exported to modules. */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
L
Linus Torvalds 已提交
85

86 87 88 89
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
	    unsigned int mtu,
	    int (*output)(struct net *, struct sock *, struct sk_buff *));
90

L
Linus Torvalds 已提交
91
/* Generate a checksum for an outgoing IP datagram. */
92
void ip_send_check(struct iphdr *iph)
L
Linus Torvalds 已提交
93 94 95 96
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
E
Eric Dumazet 已提交
97
EXPORT_SYMBOL(ip_send_check);
L
Linus Torvalds 已提交
98

99
static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
H
Herbert Xu 已提交
100
{
101
	struct net *net = dev_net(skb_dst(skb)->dev);
H
Herbert Xu 已提交
102 103 104 105
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
106 107
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
108
		       dst_output_okfn);
109 110 111 112 113
}

/*
 * Same as __ip_local_out_sk(), using the socket attached to the skb
 * (skb->sk) as the socket argument.
 */
int __ip_local_out(struct sk_buff *skb)
{
	return __ip_local_out_sk(skb->sk, skb);
}

116
/*
 * Send a locally generated packet: run the LOCAL_OUT processing in
 * __ip_local_out_sk() and, when that returns 1, continue with dst_output().
 *
 * Any other value from __ip_local_out_sk() is returned to the caller as is,
 * without calling dst_output().
 */
int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out_sk(sk, skb);
	if (likely(err == 1))
		err = dst_output(sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out_sk);
H
Herbert Xu 已提交
127

L
Linus Torvalds 已提交
128 129 130 131 132
/*
 * Pick the TTL for an outgoing packet: the socket's unicast TTL
 * (inet->uc_ttl) when it is non-negative, otherwise the hop limit that
 * ip4_dst_hoplimit() reports for @dst.
 */
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

137
/*
L
Linus Torvalds 已提交
138 139 140
 *		Add an ip header to a skbuff and send it out.
 *
 */
141
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
142
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
L
Linus Torvalds 已提交
143 144
{
	struct inet_sock *inet = inet_sk(sk);
E
Eric Dumazet 已提交
145
	struct rtable *rt = skb_rtable(skb);
L
Linus Torvalds 已提交
146 147 148
	struct iphdr *iph;

	/* Build the IP header. */
149
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
150
	skb_reset_network_header(skb);
151
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
152 153 154
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
155
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
156 157
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
L
Linus Torvalds 已提交
158
	iph->protocol = sk->sk_protocol;
159 160 161 162 163 164 165
	if (ip_dont_fragment(sk, &rt->dst)) {
		iph->frag_off = htons(IP_DF);
		iph->id = 0;
	} else {
		iph->frag_off = 0;
		__ip_select_ident(sock_net(sk), iph, 1);
	}
L
Linus Torvalds 已提交
166

167 168 169
	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
L
Linus Torvalds 已提交
170 171 172
	}

	skb->priority = sk->sk_priority;
173
	skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
174 175

	/* Send it out. */
H
Herbert Xu 已提交
176
	return ip_local_out(skb);
L
Linus Torvalds 已提交
177
}
178 179
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

180
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
181
{
E
Eric Dumazet 已提交
182
	struct dst_entry *dst = skb_dst(skb);
183
	struct rtable *rt = (struct rtable *)dst;
L
Linus Torvalds 已提交
184
	struct net_device *dev = dst->dev;
185
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
186
	struct neighbour *neigh;
187
	u32 nexthop;
L
Linus Torvalds 已提交
188

189
	if (rt->rt_type == RTN_MULTICAST) {
190
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
191
	} else if (rt->rt_type == RTN_BROADCAST)
192
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
193

L
Linus Torvalds 已提交
194
	/* Be paranoid, rather than too clever. */
195
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
L
Linus Torvalds 已提交
196 197 198
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
199
		if (!skb2) {
L
Linus Torvalds 已提交
200 201 202 203 204
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
205
		consume_skb(skb);
L
Linus Torvalds 已提交
206 207 208
		skb = skb2;
	}

209
	rcu_read_lock_bh();
J
Julian Anastasov 已提交
210
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
211 212 213
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
214
	if (!IS_ERR(neigh)) {
215
		int res = dst_neigh_output(dst, neigh, skb);
216

217
		rcu_read_unlock_bh();
218 219
		return res;
	}
220
	rcu_read_unlock_bh();
221

222 223
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
L
Linus Torvalds 已提交
224 225 226 227
	kfree_skb(skb);
	return -EINVAL;
}

228 229
static int ip_finish_output_gso(struct net *net, struct sock *sk,
				struct sk_buff *skb, unsigned int mtu)
230 231 232 233 234 235 236
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

	/* common case: locally created skb or seglen is <= mtu */
	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
237
	      skb_gso_network_seglen(skb) <= mtu)
238
		return ip_finish_output2(net, sk, skb);
239 240 241 242 243 244 245 246 247 248

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
249
	if (IS_ERR_OR_NULL(segs)) {
250 251 252 253 254 255 256 257 258 259 260
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

		segs->next = NULL;
261
		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
262 263 264 265 266 267 268 269 270

		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

271
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
272
{
273 274
	unsigned int mtu;

275 276
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
277
	if (skb_dst(skb)->xfrm) {
278
		IPCB(skb)->flags |= IPSKB_REROUTED;
279
		return dst_output(sk, skb);
280
	}
281
#endif
282
	mtu = ip_skb_dst_mtu(skb);
283
	if (skb_is_gso(skb))
284
		return ip_finish_output_gso(net, sk, skb, mtu);
285

286
	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
287
		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
288

289
	return ip_finish_output2(net, sk, skb);
L
Linus Torvalds 已提交
290 291
}

292
int ip_mc_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
293
{
E
Eric Dumazet 已提交
294
	struct rtable *rt = skb_rtable(skb);
295
	struct net_device *dev = rt->dst.dev;
296
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
297 298 299 300

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
301
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
302 303 304 305 306 307 308 309 310

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
311
		if (sk_mc_loop(sk)
L
Linus Torvalds 已提交
312 313 314 315 316 317 318 319 320
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be  dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
321 322 323
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
L
Linus Torvalds 已提交
324
#endif
325
		   ) {
L
Linus Torvalds 已提交
326 327
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
328
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
329
					net, sk, newskb, NULL, newskb->dev,
330
					dev_loopback_xmit);
L
Linus Torvalds 已提交
331 332 333 334
		}

		/* Multicasts with ttl 0 must not go beyond the host */

335
		if (ip_hdr(skb)->ttl == 0) {
L
Linus Torvalds 已提交
336 337 338 339 340 341 342 343
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
344 345 346
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				net, sk, newskb, NULL, newskb->dev,
				dev_loopback_xmit);
L
Linus Torvalds 已提交
347 348
	}

349 350 351
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb->dev,
			    ip_finish_output,
352
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
353 354
}

355
int ip_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
356
{
E
Eric Dumazet 已提交
357
	struct net_device *dev = skb_dst(skb)->dev;
358
	struct net *net = dev_net(dev);
359

360
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
361

362 363 364
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

365 366
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
367
			    ip_finish_output,
368
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
369 370
}

371 372 373 374 375 376 377 378 379 380 381 382 383 384
/*
 * Fill in the source and destination addresses of @iph from @fl4 with one
 * memcpy() spanning both fields, possibly using 64bit load/stores.
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	const size_t addr_pair_len = sizeof(fl4->saddr) + sizeof(fl4->daddr);

	/* The single copy is only valid if daddr directly follows saddr. */
	BUILD_BUG_ON(offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr) !=
		     offsetof(typeof(*fl4), daddr));
	memcpy(&iph->saddr, &fl4->saddr, addr_pair_len);
}

385 386
/* Note: skb->sk can be different from sk, in case of tunnels */
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
L
Linus Torvalds 已提交
387 388
{
	struct inet_sock *inet = inet_sk(sk);
389
	struct ip_options_rcu *inet_opt;
390
	struct flowi4 *fl4;
L
Linus Torvalds 已提交
391 392
	struct rtable *rt;
	struct iphdr *iph;
393
	int res;
L
Linus Torvalds 已提交
394 395 396 397

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
398
	rcu_read_lock();
399
	inet_opt = rcu_dereference(inet->inet_opt);
400
	fl4 = &fl->u.ip4;
E
Eric Dumazet 已提交
401
	rt = skb_rtable(skb);
402
	if (rt)
L
Linus Torvalds 已提交
403 404 405 406
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
407
	if (!rt) {
A
Al Viro 已提交
408
		__be32 daddr;
L
Linus Torvalds 已提交
409 410

		/* Use correct destination address if we have options. */
E
Eric Dumazet 已提交
411
		daddr = inet->inet_daddr;
412 413
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
L
Linus Torvalds 已提交
414

415 416 417 418
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
419
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
420 421 422 423 424 425 426 427
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
428
		sk_setup_caps(sk, &rt->dst);
L
Linus Torvalds 已提交
429
	}
430
	skb_dst_set_noref(skb, &rt->dst);
L
Linus Torvalds 已提交
431 432

packet_routed:
J
Julian Anastasov 已提交
433
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
L
Linus Torvalds 已提交
434 435 436
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
437
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
438
	skb_reset_network_header(skb);
439
	iph = ip_hdr(skb);
440
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
W
WANG Cong 已提交
441
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
L
Linus Torvalds 已提交
442 443 444
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
445
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
446
	iph->protocol = sk->sk_protocol;
447 448
	ip_copy_addrs(iph, fl4);

L
Linus Torvalds 已提交
449 450
	/* Transport layer set skb->h.foo itself. */

451 452 453
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
L
Linus Torvalds 已提交
454 455
	}

456 457
	ip_select_ident_segs(sock_net(sk), skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);
L
Linus Torvalds 已提交
458

459
	/* TODO : should we use skb->sk here instead of sk ? */
L
Linus Torvalds 已提交
460
	skb->priority = sk->sk_priority;
461
	skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
462

463 464 465
	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;
L
Linus Torvalds 已提交
466 467

no_route:
468
	rcu_read_unlock();
P
Pavel Emelyanov 已提交
469
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
470 471 472
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
E
Eric Dumazet 已提交
473
EXPORT_SYMBOL(ip_queue_xmit);
L
Linus Torvalds 已提交
474 475 476 477 478 479

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
E
Eric Dumazet 已提交
480
	skb_dst_drop(to);
481
	skb_dst_copy(to, from);
L
Linus Torvalds 已提交
482
	to->dev = from->dev;
T
Thomas Graf 已提交
483
	to->mark = from->mark;
L
Linus Torvalds 已提交
484 485 486 487 488 489 490

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
491
	nf_copy(to, from);
492 493
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
L
Linus Torvalds 已提交
494
#endif
495
	skb_copy_secmark(to, from);
L
Linus Torvalds 已提交
496 497
}

498
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
499
		       unsigned int mtu,
500
		       int (*output)(struct net *, struct sock *, struct sk_buff *))
501 502 503
{
	struct iphdr *iph = ip_hdr(skb);

504
	if ((iph->frag_off & htons(IP_DF)) == 0)
505
		return ip_do_fragment(net, sk, skb, output);
506 507

	if (unlikely(!skb->ignore_df ||
508 509
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
510
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
511 512 513 514 515 516
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

517
	return ip_do_fragment(net, sk, skb, output);
518 519
}

L
Linus Torvalds 已提交
520 521 522 523 524 525 526
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

527 528
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct net *, struct sock *, struct sk_buff *))
L
Linus Torvalds 已提交
529 530 531 532 533
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
534
	unsigned int mtu, hlen, left, len, ll_rs;
L
Linus Torvalds 已提交
535
	int offset;
536
	__be16 not_last_frag;
E
Eric Dumazet 已提交
537
	struct rtable *rt = skb_rtable(skb);
L
Linus Torvalds 已提交
538 539
	int err = 0;

540
	dev = rt->dst.dev;
L
Linus Torvalds 已提交
541 542 543 544 545

	/*
	 *	Point into the IP datagram header.
	 */

546
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
547

548
	mtu = ip_skb_dst_mtu(skb);
549 550
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;
L
Linus Torvalds 已提交
551 552 553 554 555 556

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
557
	mtu = mtu - hlen;	/* Size of data space */
H
Herbert Xu 已提交
558
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
L
Linus Torvalds 已提交
559 560 561 562 563 564 565 566

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
567
	if (skb_has_frag_list(skb)) {
568
		struct sk_buff *frag, *frag2;
L
Linus Torvalds 已提交
569 570 571 572
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
573
		    ip_is_fragment(iph) ||
L
Linus Torvalds 已提交
574 575 576
		    skb_cloned(skb))
			goto slow_path;

577
		skb_walk_frags(skb, frag) {
L
Linus Torvalds 已提交
578 579 580 581
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
582
				goto slow_path_clean;
L
Linus Torvalds 已提交
583 584 585

			/* Partially cloned skb? */
			if (skb_shared(frag))
586
				goto slow_path_clean;
587 588 589 590 591 592

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
593
			skb->truesize -= frag->truesize;
L
Linus Torvalds 已提交
594 595 596 597 598 599 600
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
601
		skb_frag_list_init(skb);
L
Linus Torvalds 已提交
602 603 604 605 606 607 608 609 610 611 612
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
613
				skb_reset_transport_header(frag);
614 615
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
616
				memcpy(skb_network_header(frag), iph, hlen);
617
				iph = ip_hdr(frag);
L
Linus Torvalds 已提交
618 619 620 621 622 623
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
624
				if (frag->next)
L
Linus Torvalds 已提交
625 626 627 628 629
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

630
			err = output(net, sk, skb);
L
Linus Torvalds 已提交
631

632
			if (!err)
633
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
634 635 636 637 638 639 640 641 642
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
643
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
644 645 646 647 648 649 650 651
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
652
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
653
		return err;
654 655 656 657 658 659 660 661 662

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
L
Linus Torvalds 已提交
663 664 665
	}

slow_path:
666 667 668
	/* for offloaded checksums cleanup checksum before fragmentation */
	if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
		goto fail;
669
	iph = ip_hdr(skb);
670

L
Linus Torvalds 已提交
671
	left = skb->len - hlen;		/* Space per frame */
672
	ptr = hlen;		/* Where to start from */
L
Linus Torvalds 已提交
673

674
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
675

L
Linus Torvalds 已提交
676 677 678 679 680 681 682 683 684 685 686
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

S
Stephen Hemminger 已提交
687
	while (left > 0) {
L
Linus Torvalds 已提交
688 689 690 691
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
L
Lucas De Marchi 已提交
692
		/* IF: we are not sending up to and including the packet end
L
Linus Torvalds 已提交
693 694 695 696 697
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

698 699 700
		/* Allocate buffer */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (!skb2) {
L
Linus Torvalds 已提交
701 702 703 704 705 706 707 708 709 710 711
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
712
		skb_reset_network_header(skb2);
713
		skb2->transport_header = skb2->network_header + hlen;
L
Linus Torvalds 已提交
714 715 716 717 718 719 720 721 722 723 724 725 726

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

727
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
L
Linus Torvalds 已提交
728 729 730 731

		/*
		 *	Copy a block of the IP datagram.
		 */
732
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
L
Linus Torvalds 已提交
733 734 735 736 737 738
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
739
		iph = ip_hdr(skb2);
L
Linus Torvalds 已提交
740 741
		iph->frag_off = htons((offset >> 3));

742 743 744
		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
			iph->frag_off |= htons(IP_DF);

L
Linus Torvalds 已提交
745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769
		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

770
		err = output(net, sk, skb2);
L
Linus Torvalds 已提交
771 772
		if (err)
			goto fail;
773

774
		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
775
	}
776
	consume_skb(skb);
777
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
778 779 780
	return err;

fail:
781
	kfree_skb(skb);
782
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
783 784
	return err;
}
785
EXPORT_SYMBOL(ip_do_fragment);
786

L
Linus Torvalds 已提交
787 788 789
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
790
	struct msghdr *msg = from;
L
Linus Torvalds 已提交
791

792
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
793
		if (copy_from_iter(to, len, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
794 795
			return -EFAULT;
	} else {
796
		__wsum csum = 0;
797
		if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
798 799 800 801 802
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
E
Eric Dumazet 已提交
803
EXPORT_SYMBOL(ip_generic_getfrag);
L
Linus Torvalds 已提交
804

805
/*
 * Checksum @copy bytes starting @offset bytes into @page.  The page is
 * mapped with kmap() for the duration of the csum_partial() call and
 * unmapped again before returning.
 */
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

A
Adrian Bunk 已提交
816
static inline int ip_ufo_append_data(struct sock *sk,
817
			struct sk_buff_head *queue,
818 819 820
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
821
			int transhdrlen, int maxfraglen, unsigned int flags)
822 823 824 825 826 827 828 829
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
830 831
	skb = skb_peek_tail(queue);
	if (!skb) {
832 833 834 835
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

836
		if (!skb)
837 838 839 840 841 842
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
843
		skb_put(skb, fragheaderlen + transhdrlen);
844 845

		/* initialize network header pointer */
846
		skb_reset_network_header(skb);
847 848

		/* initialize protocol header pointer */
849
		skb->transport_header = skb->network_header + fragheaderlen;
850 851 852

		skb->csum = 0;

853
		__skb_queue_tail(queue, skb);
854 855
	} else if (skb_is_gso(skb)) {
		goto append;
856
	}
857

858 859 860 861 862 863
	skb->ip_summed = CHECKSUM_PARTIAL;
	/* specify the length of each IP datagram fragment */
	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;

append:
864 865
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
866 867
}

868 869 870
/*
 * Append @length bytes, fetched through @getfrag, to the datagram that is
 * being assembled on @queue.
 *
 * Data is first placed into the tail skb of @queue while it still has room;
 * once it is full a new skb is allocated and queued, so that every queued skb
 * corresponds to one IP fragment.  @transhdrlen is non-zero only for the
 * first chunk of a datagram and is reset to 0 after the first skb is built.
 *
 * Returns 0 on success or a negative errno.  On failure the bytes that were
 * not appended are subtracted from cork->length again and
 * IPSTATS_MIB_OUTDISCARDS is incremented.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
871
			    struct inet_cork *cork,
872
			    struct page_frag *pfrag,
873 874 875 876
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
L
Linus Torvalds 已提交
877 878 879 880
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

881
	struct ip_options *opt = cork->opt;
L
Linus Torvalds 已提交
882 883 884 885 886 887
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
888
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
L
Linus Torvalds 已提交
889
	int csummode = CHECKSUM_NONE;
890
	struct rtable *rt = (struct rtable *)cork->dst;
891
	u32 tskey = 0;
L
Linus Torvalds 已提交
892

893 894 895
	skb = skb_peek_tail(queue);

	/* Extension header room is only accounted for when the queue is empty. */
	exthdrlen = !skb ? rt->dst.header_len : 0;
896
	mtu = cork->fragsize;
897 898 899
	/* Take a key from the socket when SW timestamps with OPT_ID are on. */
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
L
Linus Torvalds 已提交
900

901
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
L
Linus Torvalds 已提交
902 903 904

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Per-fragment payload is rounded down to a multiple of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
905
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
906

907
	/* Refuse data that would grow the datagram beyond maxnonfragsize. */
	if (cork->length + length > maxnonfragsize - fragheaderlen) {
908
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
909
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
910 911 912 913 914 915 916 917 918
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
919
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
L
Linus Torvalds 已提交
920
	    !exthdrlen)
921
		csummode = CHECKSUM_PARTIAL;
L
Linus Torvalds 已提交
922

923
	cork->length += length;
924
	/*
	 * UDP datagram sockets on a UFO capable device: data that exceeds
	 * the mtu, or that follows an skb which is already GSO, is handed
	 * to ip_ufo_append_data() instead of being split up here.
	 */
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
925
	    (sk->sk_protocol == IPPROTO_UDP) &&
926 927
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM)) {
928 929
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
930
					 maxfraglen, flags);
931
		if (err)
932 933 934
			goto error;
		return 0;
	}
L
Linus Torvalds 已提交
935 936 937 938 939 940 941 942

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

943
	/* Empty queue: jump straight to the allocation code inside the loop. */
	if (!skb)
L
Linus Torvalds 已提交
944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/*
			 * fraggap is the number of bytes by which the previous
			 * skb exceeds maxfraglen; they are moved into the new
			 * skb further down.
			 */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

974
			if ((flags & MSG_MORE) &&
975
			    !(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
976 977
				alloclen = mtu;
			else
978
				alloclen = fraglen;
L
Linus Torvalds 已提交
979

980 981
			alloclen += exthdrlen;

L
Linus Torvalds 已提交
982 983 984 985 986
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
987
			if (datalen == length + fraggap)
988
				alloclen += rt->dst.trailer_len;
989

L
Linus Torvalds 已提交
990
			/*
			 * The first skb of a datagram is allocated with
			 * sock_alloc_send_skb(); later ones use sock_wmalloc()
			 * and only while sk_wmem_alloc is at most twice
			 * sk_sndbuf.
			 */
			if (transhdrlen) {
991
				skb = sock_alloc_send_skb(sk,
L
Linus Torvalds 已提交
992 993 994 995 996 997
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
998
					skb = sock_wmalloc(sk,
L
Linus Torvalds 已提交
999 1000
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
1001
				if (unlikely(!skb))
L
Linus Torvalds 已提交
1002 1003
					err = -ENOBUFS;
			}
1004
			if (!skb)
L
Linus Torvalds 已提交
1005 1006 1007 1008 1009 1010 1011 1012
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
1013 1014

			/* only the initial fragment is time stamped */
1015
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1016
			cork->tx_flags = 0;
1017 1018
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
L
Linus Torvalds 已提交
1019 1020 1021 1022

			/*
			 *	Find where to start putting bytes.
			 */
1023
			data = skb_put(skb, fraglen + exthdrlen);
1024
			skb_set_network_header(skb, exthdrlen);
1025 1026
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
1027
			data += fragheaderlen + exthdrlen;
L
Linus Torvalds 已提交
1028 1029 1030 1031 1032 1033 1034 1035

			/*
			 * Move the overhanging bytes of the previous skb into
			 * this one, transfer their checksum and trim the
			 * previous skb back to maxfraglen.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
1036
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Header lengths and checksum mode apply to the first skb only. */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
1055
			__skb_queue_tail(queue, skb);
L
Linus Torvalds 已提交
1056 1057 1058 1059 1060 1061
			continue;
		}

		if (copy > length)
			copy = length;

1062
		/* Without scatter-gather the data goes into the linear area. */
		if (!(rt->dst.dev->features&NETIF_F_SG)) {
L
Linus Torvalds 已提交
1063 1064 1065
			unsigned int off;

			off = skb->len;
1066
			if (getfrag(from, skb_put(skb, copy),
L
Linus Torvalds 已提交
1067 1068 1069 1070 1071 1072 1073 1074
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* With scatter-gather the data goes into the page frag. */
			int i = skb_shinfo(skb)->nr_frags;

1075 1076
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
L
Linus Torvalds 已提交
1077
				goto error;
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088

			/* Start a new fragment slot when the data cannot be coalesced. */
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
L
Linus Torvalds 已提交
1089
			}
1090 1091 1092 1093 1094 1095 1096 1097
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
L
Linus Torvalds 已提交
1098 1099
			skb->len += copy;
			skb->data_len += copy;
1100 1101
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1102 1103 1104 1105 1106 1107 1108
		}
		offset += copy;
		length -= copy;
	}

	return 0;

1109 1110
error_efault:
	err = -EFAULT;
L
Linus Torvalds 已提交
1111
error:
1112
	/* length now holds the bytes that were not appended. */
	cork->length -= length;
P
Pavel Emelyanov 已提交
1113
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1114
	return err;
L
Linus Torvalds 已提交
1115 1116
}

1117 1118 1119
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
1120
	struct ip_options_rcu *opt;
1121 1122 1123 1124 1125 1126 1127
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
1128
		if (!cork->opt) {
1129 1130
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
1131
			if (unlikely(!cork->opt))
1132 1133
				return -ENOBUFS;
		}
1134
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
1145 1146
	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1147 1148
	cork->dst = &rt->dst;
	cork->length = 0;
1149 1150 1151
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167
	cork->tx_flags = ipc->tx_flags;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
1168
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
1182
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1183 1184 1185 1186 1187 1188
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

1189 1190
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
1191 1192 1193
				from, length, transhdrlen, flags);
}

1194
/*
 * Append @size bytes of @page, starting at @offset, to the datagram that is
 * pending on sk->sk_write_queue.
 *
 * Returns 0 on success (and for MSG_PROBE), -EPERM when the socket has
 * hdrincl set, -EINVAL when nothing is pending, -EOPNOTSUPP when the output
 * device lacks NETIF_F_SG, -EMSGSIZE when the datagram would become too
 * large or the page cannot be attached, and -ENOBUFS when a new skb cannot
 * be allocated.  On the error path the bytes not appended are subtracted
 * from cork->length and IPSTATS_MIB_OUTDISCARDS is incremented.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
L
Linus Torvalds 已提交
1195 1196 1197 1198 1199 1200
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
1201
	struct inet_cork *cork;
L
Linus Torvalds 已提交
1202 1203 1204 1205
	int hh_len;
	int mtu;
	int len;
	int err;
1206
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
L
Linus Torvalds 已提交
1207 1208 1209 1210 1211 1212 1213 1214 1215 1216

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* Pages can only extend a datagram that has already been started. */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

1217 1218 1219 1220
	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1221

1222
	if (!(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
1223 1224
		return -EOPNOTSUPP;

1225
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1226
	mtu = cork->fragsize;
L
Linus Torvalds 已提交
1227 1228 1229

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Per-fragment payload is rounded down to a multiple of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
1230
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
1231

1232
	/* Refuse data that would grow the datagram beyond maxnonfragsize. */
	if (cork->length + size > maxnonfragsize - fragheaderlen) {
1233 1234
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
1235 1236 1237
		return -EMSGSIZE;
	}

1238 1239
	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
L
Linus Torvalds 已提交
1240 1241
		return -EINVAL;

1242
	cork->length += size;
1243 1244
	/* Mark the tail skb as UDP GSO when the data no longer fits the mtu. */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
1245
	    (rt->dst.dev->features & NETIF_F_UFO)) {
1246
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
H
Herbert Xu 已提交
1247
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1248
	}
1249

L
Linus Torvalds 已提交
1250
	while (size > 0) {
1251
		/* A GSO skb takes all remaining data in one go. */
		if (skb_is_gso(skb)) {
1252
			len = size;
1253
		} else {
1254 1255 1256 1257 1258 1259

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
L
Linus Torvalds 已提交
1260 1261 1262 1263 1264
		/* No room left in the current skb: start a new fragment. */
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
1265
			/* Bytes by which the previous skb exceeds maxfraglen. */
			fraggap = skb_prev->len - maxfraglen;
L
Linus Torvalds 已提交
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
1284
			skb_put(skb, fragheaderlen + fraggap);
1285
			skb_reset_network_header(skb);
1286 1287
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
L
Linus Torvalds 已提交
1288
			/*
			 * Move the overhanging bytes of the previous skb into
			 * this one, transfer their checksum and trim the
			 * previous skb back to maxfraglen.
			 */
			if (fraggap) {
1289 1290
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
1291
						    skb_transport_header(skb),
1292
								   fraggap, 0);
L
Linus Torvalds 已提交
1293 1294
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
1295
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;
1307 1308

		if (skb_append_pagefrags(skb, page, offset, len)) {
L
Linus Torvalds 已提交
1309 1310 1311 1312 1313
			err = -EMSGSIZE;
			goto error;
		}

		/* Software checksum: fold the page data into skb->csum. */
		if (skb->ip_summed == CHECKSUM_NONE) {
1314
			__wsum csum;
L
Linus Torvalds 已提交
1315 1316 1317 1318 1319 1320
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
1321 1322
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1323 1324 1325 1326 1327 1328
		offset += len;
		size -= len;
	}
	return 0;

error:
1329
	/* size now holds the bytes that were not appended. */
	cork->length -= size;
P
Pavel Emelyanov 已提交
1330
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1331 1332 1333
	return err;
}

1334
static void ip_cork_release(struct inet_cork *cork)
1335
{
1336 1337 1338 1339 1340
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
1341 1342
}

L
Linus Torvalds 已提交
1343 1344 1345 1346
/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
1347
struct sk_buff *__ip_make_skb(struct sock *sk,
1348
			      struct flowi4 *fl4,
1349 1350
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
L
Linus Torvalds 已提交
1351 1352 1353 1354
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
1355
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
1356
	struct ip_options *opt = NULL;
1357
	struct rtable *rt = (struct rtable *)cork->dst;
L
Linus Torvalds 已提交
1358
	struct iphdr *iph;
1359
	__be16 df = 0;
L
Linus Torvalds 已提交
1360 1361
	__u8 ttl;

1362 1363
	skb = __skb_dequeue(queue);
	if (!skb)
L
Linus Torvalds 已提交
1364 1365 1366 1367
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
1368
	if (skb->data < skb_network_header(skb))
1369
		__skb_pull(skb, skb_network_offset(skb));
1370
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1371
		__skb_pull(tmp_skb, skb_network_header_len(skb));
L
Linus Torvalds 已提交
1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
W
WANG Cong 已提交
1385
	skb->ignore_df = ip_sk_ignore_df(sk);
L
Linus Torvalds 已提交
1386 1387

	/* DF bit is set when we want to see DF on outgoing frames.
W
WANG Cong 已提交
1388
	 * If ignore_df is set too, we still allow to fragment this frame
L
Linus Torvalds 已提交
1389
	 * locally. */
1390 1391
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
1392 1393
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
L
Linus Torvalds 已提交
1394 1395
		df = htons(IP_DF);

1396 1397
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1398

1399 1400 1401
	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
L
Linus Torvalds 已提交
1402 1403
		ttl = inet->mc_ttl;
	else
1404
		ttl = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
1405

1406
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1407 1408
	iph->version = 4;
	iph->ihl = 5;
1409
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
L
Linus Torvalds 已提交
1410 1411 1412
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
1413
	ip_copy_addrs(iph, fl4);
1414
	ip_select_ident(net, skb, sk);
L
Linus Torvalds 已提交
1415

1416 1417 1418 1419 1420
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

1421
	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1422
	skb->mark = sk->sk_mark;
1423 1424 1425 1426
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
1427
	cork->dst = NULL;
1428
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
1429

1430
	if (iph->protocol == IPPROTO_ICMP)
1431
		icmp_out_count(net, ((struct icmphdr *)
1432 1433
			skb_transport_header(skb))->type);

1434 1435 1436 1437 1438
	ip_cork_release(cork);
out:
	return skb;
}

E
Eric Dumazet 已提交
1439
int ip_send_skb(struct net *net, struct sk_buff *skb)
1440 1441 1442
{
	int err;

H
Herbert Xu 已提交
1443
	err = ip_local_out(skb);
L
Linus Torvalds 已提交
1444 1445
	if (err) {
		if (err > 0)
E
Eric Dumazet 已提交
1446
			err = net_xmit_errno(err);
L
Linus Torvalds 已提交
1447
		if (err)
1448
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1449 1450 1451 1452 1453
	}

	return err;
}

1454
/*
 * Build the pending datagram of @sk with ip_finish_skb() and send it.
 * Returns 0 when there was nothing to send, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented, skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}

L
Linus Torvalds 已提交
1466 1467 1468
/*
 *	Throw away all pending data on the socket.
 */
1469 1470 1471
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
L
Linus Torvalds 已提交
1472 1473 1474
{
	struct sk_buff *skb;

1475
	while ((skb = __skb_dequeue_tail(queue)) != NULL)
L
Linus Torvalds 已提交
1476 1477
		kfree_skb(skb);

1478 1479 1480 1481 1482
	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
1483
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
L
Linus Torvalds 已提交
1484 1485
}

1486
struct sk_buff *ip_make_skb(struct sock *sk,
1487
			    struct flowi4 *fl4,
1488 1489 1490 1491 1492 1493
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
1494
	struct inet_cork cork;
1495 1496 1497 1498 1499 1500 1501 1502
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

1503 1504
	cork.flags = 0;
	cork.addr = 0;
1505
	cork.opt = NULL;
1506 1507 1508 1509
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

1510 1511
	err = __ip_append_data(sk, fl4, &queue, &cork,
			       &current->task_frag, getfrag,
1512 1513 1514 1515 1516 1517
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

1518
	return __ip_make_skb(sk, fl4, &queue, &cork);
1519
}
L
Linus Torvalds 已提交
1520 1521 1522 1523

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
1524
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
L
Linus Torvalds 已提交
1525 1526
			      int len, int odd, struct sk_buff *skb)
{
1527
	__wsum csum;
L
Linus Torvalds 已提交
1528 1529 1530

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
1531
	return 0;
L
Linus Torvalds 已提交
1532 1533
}

1534
/*
L
Linus Torvalds 已提交
1535
 *	Generic function to send a packet as reply to another packet.
1536
 *	Used to send some TCP resets/acks so far.
L
Linus Torvalds 已提交
1537
 */
1538
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1539 1540 1541
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
1542
			   unsigned int len)
L
Linus Torvalds 已提交
1543
{
1544
	struct ip_options_data replyopts;
L
Linus Torvalds 已提交
1545
	struct ipcm_cookie ipc;
1546
	struct flowi4 fl4;
E
Eric Dumazet 已提交
1547
	struct rtable *rt = skb_rtable(skb);
1548
	struct net *net = sock_net(sk);
1549
	struct sk_buff *nskb;
1550
	int err;
1551
	int oif;
L
Linus Torvalds 已提交
1552

1553
	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
L
Linus Torvalds 已提交
1554 1555
		return;

1556
	ipc.addr = daddr;
L
Linus Torvalds 已提交
1557
	ipc.opt = NULL;
1558
	ipc.tx_flags = 0;
1559 1560
	ipc.ttl = 0;
	ipc.tos = -1;
L
Linus Torvalds 已提交
1561

1562
	if (replyopts.opt.opt.optlen) {
L
Linus Torvalds 已提交
1563 1564
		ipc.opt = &replyopts.opt;

1565 1566
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
L
Linus Torvalds 已提交
1567 1568
	}

1569
	oif = arg->bound_dev_if;
1570
	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
1571 1572 1573
		oif = skb->skb_iif;

	flowi4_init_output(&fl4, oif,
1574
			   IP4_REPLY_MARK(net, skb->mark),
1575
			   RT_TOS(arg->tos),
1576
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1577
			   ip_reply_arg_flowi_flags(arg),
1578
			   daddr, saddr,
1579 1580
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1581
	rt = ip_route_output_key(net, &fl4);
1582 1583
	if (IS_ERR(rt))
		return;
L
Linus Torvalds 已提交
1584

1585
	inet_sk(sk)->tos = arg->tos;
L
Linus Torvalds 已提交
1586 1587

	sk->sk_priority = skb->priority;
1588
	sk->sk_protocol = ip_hdr(skb)->protocol;
1589
	sk->sk_bound_dev_if = arg->bound_dev_if;
1590
	sk->sk_sndbuf = sysctl_wmem_default;
1591 1592 1593 1594 1595 1596 1597
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

1598 1599
	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
L
Linus Torvalds 已提交
1600
		if (arg->csumoffset >= 0)
1601 1602
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1603
								arg->csum));
1604 1605
		nskb->ip_summed = CHECKSUM_NONE;
		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1606
		ip_push_pending_frames(sk, &fl4);
L
Linus Torvalds 已提交
1607
	}
1608
out:
L
Linus Torvalds 已提交
1609 1610 1611 1612 1613 1614 1615 1616
	ip_rt_put(rt);
}

/*
 * Boot-time initialisation of the IP layer: routing, the inet peer
 * storage and, when CONFIG_IP_MULTICAST is configured, IGMP.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
	igmp_mc_init();
#endif
}