ip_output.c 39.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
A
Al Viro 已提交
52
#include <linux/highmem.h>
53
#include <linux/slab.h>
L
Linus Torvalds 已提交
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
69
#include <net/xfrm.h>
L
Linus Torvalds 已提交
70 71 72 73 74 75 76 77 78 79 80
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
81
#include <linux/tcp.h>
L
Linus Torvalds 已提交
82

83
/*
 * System-wide default IP TTL, initialised to IPDEFTTL and exported to other
 * modules.  NOTE(review): the name suggests it is tunable through sysctl;
 * the sysctl table itself is not in this file - confirm there.
 */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84
EXPORT_SYMBOL(sysctl_ip_default_ttl);
L
Linus Torvalds 已提交
85

86 87 88
/*
 * Forward declaration: ip_fragment() is defined further down in this file
 * but is already called from ip_finish_output_gso() and ip_finish_output().
 * @output is the per-fragment transmit callback.
 */
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
		       int (*output)(struct sock *, struct sk_buff *));

L
Linus Torvalds 已提交
89
/* Generate a checksum for an outgoing IP datagram. */
90
void ip_send_check(struct iphdr *iph)
L
Linus Torvalds 已提交
91 92 93 94
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
E
Eric Dumazet 已提交
95
EXPORT_SYMBOL(ip_send_check);
L
Linus Torvalds 已提交
96

97
static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
H
Herbert Xu 已提交
98 99 100 101 102
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
103 104 105 106 107 108 109
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
		       skb_dst(skb)->dev, dst_output_sk);
}

/*
 * Same as __ip_local_out_sk(), using the socket attached to the skb itself.
 */
int __ip_local_out(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	return __ip_local_out_sk(sk, skb);
}

112
/*
 * Send a locally generated packet on behalf of @sk.
 *
 * The header is finalised and the NF_INET_LOCAL_OUT hook is run through
 * __ip_local_out_sk().  A hook result of 1 is treated as "continue here":
 * the packet is then handed to dst_output_sk().  Any other value is
 * returned to the caller unchanged.
 *
 * @sk is passed to the hook explicitly rather than relying on skb->sk:
 * the two may differ (e.g. for tunnels), and the hook's continuation and
 * the direct dst_output_sk() call below must see the same socket.
 */
int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out_sk(sk, skb);
	if (likely(err == 1))
		err = dst_output_sk(sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out_sk);
H
Herbert Xu 已提交
123

L
Linus Torvalds 已提交
124 125 126 127 128
/*
 * Pick the TTL for an outgoing packet: the socket's uc_ttl when it is
 * non-negative, otherwise the hop limit derived from the route.
 */
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	return ttl < 0 ? ip4_dst_hoplimit(dst) : ttl;
}

133
/*
 *		Prepend an IPv4 header (and any IP options in @opt) to the
 *		skb, fill it in from @sk, @saddr and @daddr, and transmit the
 *		result with ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	unsigned int optlen = opt ? opt->opt.optlen : 0;
	struct iphdr *iph;

	/* Make room for header plus options and point the skb at it. */
	skb_push(skb, sizeof(struct iphdr) + optlen);
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	iph->frag_off = ip_dont_fragment(sk, &rt->dst) ? htons(IP_DF) : 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);

	/* With source routing the packet goes to the first hop instead. */
	if (opt && opt->opt.srr)
		iph->daddr = opt->opt.faddr;
	else
		iph->daddr = daddr;
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(sock_net(sk), skb, sk);

	if (optlen) {
		/* Options extend the header; ihl counts 4-byte units. */
		iph->ihl += optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Hand the finished packet to the local output path. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

174
static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
175
{
E
Eric Dumazet 已提交
176
	struct dst_entry *dst = skb_dst(skb);
177
	struct rtable *rt = (struct rtable *)dst;
L
Linus Torvalds 已提交
178
	struct net_device *dev = dst->dev;
179
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
180
	struct neighbour *neigh;
181
	u32 nexthop;
L
Linus Torvalds 已提交
182

183 184 185 186
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
187

L
Linus Torvalds 已提交
188
	/* Be paranoid, rather than too clever. */
189
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
L
Linus Torvalds 已提交
190 191 192
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
193
		if (!skb2) {
L
Linus Torvalds 已提交
194 195 196 197 198
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
199
		consume_skb(skb);
L
Linus Torvalds 已提交
200 201 202
		skb = skb2;
	}

203
	rcu_read_lock_bh();
J
Julian Anastasov 已提交
204
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
205 206 207
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
208
	if (!IS_ERR(neigh)) {
209
		int res = dst_neigh_output(dst, neigh, skb);
210

211
		rcu_read_unlock_bh();
212 213
		return res;
	}
214
	rcu_read_unlock_bh();
215

216 217
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
L
Linus Torvalds 已提交
218 219 220 221
	kfree_skb(skb);
	return -EINVAL;
}

222
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
223 224 225 226 227 228 229 230
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

	/* common case: locally created skb or seglen is <= mtu */
	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
231
		return ip_finish_output2(sk, skb);
232 233 234 235 236 237 238 239 240 241

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
242
	if (IS_ERR_OR_NULL(segs)) {
243 244 245 246 247 248 249 250 251 252 253
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

		segs->next = NULL;
254
		err = ip_fragment(sk, segs, ip_finish_output2);
255 256 257 258 259 260 261 262 263

		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

264
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
265
{
266 267
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
268
	if (skb_dst(skb)->xfrm) {
269
		IPCB(skb)->flags |= IPSKB_REROUTED;
270
		return dst_output_sk(sk, skb);
271
	}
272
#endif
273
	if (skb_is_gso(skb))
274
		return ip_finish_output_gso(sk, skb);
275 276

	if (skb->len > ip_skb_dst_mtu(skb))
277
		return ip_fragment(sk, skb, ip_finish_output2);
278

279
	return ip_finish_output2(sk, skb);
L
Linus Torvalds 已提交
280 281
}

282
int ip_mc_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
283
{
E
Eric Dumazet 已提交
284
	struct rtable *rt = skb_rtable(skb);
285
	struct net_device *dev = rt->dst.dev;
L
Linus Torvalds 已提交
286 287 288 289

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
290
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
291 292 293 294 295 296 297 298 299

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
300
		if (sk_mc_loop(sk)
L
Linus Torvalds 已提交
301 302 303 304 305 306 307 308 309
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be  dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
310 311 312
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
L
Linus Torvalds 已提交
313
#endif
314
		   ) {
L
Linus Torvalds 已提交
315 316
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
317
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
318
					sk, newskb, NULL, newskb->dev,
319
					dev_loopback_xmit);
L
Linus Torvalds 已提交
320 321 322 323
		}

		/* Multicasts with ttl 0 must not go beyond the host */

324
		if (ip_hdr(skb)->ttl == 0) {
L
Linus Torvalds 已提交
325 326 327 328 329 330 331 332
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
333
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
334
				NULL, newskb->dev, dev_loopback_xmit);
L
Linus Torvalds 已提交
335 336
	}

337
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
338
			    skb->dev, ip_finish_output,
339
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
340 341
}

342
int ip_output(struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
343
{
E
Eric Dumazet 已提交
344
	struct net_device *dev = skb_dst(skb)->dev;
345

346
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
L
Linus Torvalds 已提交
347

348 349 350
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

351 352
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
			    NULL, dev,
353
			    ip_finish_output,
354
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
L
Linus Torvalds 已提交
355 356
}

357 358 359 360 361 362 363 364 365 366 367 368 369 370
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}

371 372
/* Note: skb->sk can be different from sk, in case of tunnels */
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
L
Linus Torvalds 已提交
373 374
{
	struct inet_sock *inet = inet_sk(sk);
375
	struct ip_options_rcu *inet_opt;
376
	struct flowi4 *fl4;
L
Linus Torvalds 已提交
377 378
	struct rtable *rt;
	struct iphdr *iph;
379
	int res;
L
Linus Torvalds 已提交
380 381 382 383

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
384
	rcu_read_lock();
385
	inet_opt = rcu_dereference(inet->inet_opt);
386
	fl4 = &fl->u.ip4;
E
Eric Dumazet 已提交
387
	rt = skb_rtable(skb);
388
	if (rt)
L
Linus Torvalds 已提交
389 390 391 392
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
393
	if (!rt) {
A
Al Viro 已提交
394
		__be32 daddr;
L
Linus Torvalds 已提交
395 396

		/* Use correct destination address if we have options. */
E
Eric Dumazet 已提交
397
		daddr = inet->inet_daddr;
398 399
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
L
Linus Torvalds 已提交
400

401 402 403 404
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
405
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
406 407 408 409 410 411 412 413
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
414
		sk_setup_caps(sk, &rt->dst);
L
Linus Torvalds 已提交
415
	}
416
	skb_dst_set_noref(skb, &rt->dst);
L
Linus Torvalds 已提交
417 418

packet_routed:
J
Julian Anastasov 已提交
419
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
L
Linus Torvalds 已提交
420 421 422
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
423
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
424
	skb_reset_network_header(skb);
425
	iph = ip_hdr(skb);
426
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
W
WANG Cong 已提交
427
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
L
Linus Torvalds 已提交
428 429 430
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
431
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
432
	iph->protocol = sk->sk_protocol;
433 434
	ip_copy_addrs(iph, fl4);

L
Linus Torvalds 已提交
435 436
	/* Transport layer set skb->h.foo itself. */

437 438 439
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
L
Linus Torvalds 已提交
440 441
	}

442 443
	ip_select_ident_segs(sock_net(sk), skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);
L
Linus Torvalds 已提交
444

445
	/* TODO : should we use skb->sk here instead of sk ? */
L
Linus Torvalds 已提交
446
	skb->priority = sk->sk_priority;
447
	skb->mark = sk->sk_mark;
L
Linus Torvalds 已提交
448

449 450 451
	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;
L
Linus Torvalds 已提交
452 453

no_route:
454
	rcu_read_unlock();
P
Pavel Emelyanov 已提交
455
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
456 457 458
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
E
Eric Dumazet 已提交
459
EXPORT_SYMBOL(ip_queue_xmit);
L
Linus Torvalds 已提交
460 461 462 463 464 465

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
E
Eric Dumazet 已提交
466
	skb_dst_drop(to);
467
	skb_dst_copy(to, from);
L
Linus Torvalds 已提交
468
	to->dev = from->dev;
T
Thomas Graf 已提交
469
	to->mark = from->mark;
L
Linus Torvalds 已提交
470 471 472 473 474 475 476

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
477
	nf_copy(to, from);
478 479
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
L
Linus Torvalds 已提交
480
#endif
481
	skb_copy_secmark(to, from);
L
Linus Torvalds 已提交
482 483
}

484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
		       int (*output)(struct sock *, struct sk_buff *))
{
	struct iphdr *iph = ip_hdr(skb);
	unsigned int mtu = ip_skb_dst_mtu(skb);

	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
		struct rtable *rt = skb_rtable(skb);
		struct net_device *dev = rt->dst.dev;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_do_fragment(sk, skb, output);
}

L
Linus Torvalds 已提交
506 507 508 509 510 511 512
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

513 514
int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct sock *, struct sk_buff *))
L
Linus Torvalds 已提交
515 516 517 518 519
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
520
	unsigned int mtu, hlen, left, len, ll_rs;
L
Linus Torvalds 已提交
521
	int offset;
522
	__be16 not_last_frag;
E
Eric Dumazet 已提交
523
	struct rtable *rt = skb_rtable(skb);
L
Linus Torvalds 已提交
524 525
	int err = 0;

526
	dev = rt->dst.dev;
L
Linus Torvalds 已提交
527 528 529 530 531

	/*
	 *	Point into the IP datagram header.
	 */

532
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
533

534
	mtu = ip_skb_dst_mtu(skb);
L
Linus Torvalds 已提交
535 536 537 538 539 540

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
541
	mtu = mtu - hlen;	/* Size of data space */
542
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
543 544 545
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
H
Herbert Xu 已提交
546
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
L
Linus Torvalds 已提交
547 548 549 550 551 552 553 554

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
555
	if (skb_has_frag_list(skb)) {
556
		struct sk_buff *frag, *frag2;
L
Linus Torvalds 已提交
557 558 559 560
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
561
		    ip_is_fragment(iph) ||
L
Linus Torvalds 已提交
562 563 564
		    skb_cloned(skb))
			goto slow_path;

565
		skb_walk_frags(skb, frag) {
L
Linus Torvalds 已提交
566 567 568 569
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
570
				goto slow_path_clean;
L
Linus Torvalds 已提交
571 572 573

			/* Partially cloned skb? */
			if (skb_shared(frag))
574
				goto slow_path_clean;
575 576 577 578 579 580

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
581
			skb->truesize -= frag->truesize;
L
Linus Torvalds 已提交
582 583 584 585 586 587 588
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
589
		skb_frag_list_init(skb);
L
Linus Torvalds 已提交
590 591 592 593 594 595 596 597 598 599 600
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
601
				skb_reset_transport_header(frag);
602 603
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
604
				memcpy(skb_network_header(frag), iph, hlen);
605
				iph = ip_hdr(frag);
L
Linus Torvalds 已提交
606 607 608 609 610 611
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
612
				if (frag->next)
L
Linus Torvalds 已提交
613 614 615 616 617
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

618
			err = output(sk, skb);
L
Linus Torvalds 已提交
619

620
			if (!err)
P
Pavel Emelyanov 已提交
621
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
622 623 624 625 626 627 628 629 630
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
P
Pavel Emelyanov 已提交
631
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
632 633 634 635 636 637 638 639
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
P
Pavel Emelyanov 已提交
640
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
641
		return err;
642 643 644 645 646 647 648 649 650

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
L
Linus Torvalds 已提交
651 652 653
	}

slow_path:
654 655 656
	/* for offloaded checksums cleanup checksum before fragmentation */
	if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
		goto fail;
657
	iph = ip_hdr(skb);
658

L
Linus Torvalds 已提交
659
	left = skb->len - hlen;		/* Space per frame */
660
	ptr = hlen;		/* Where to start from */
L
Linus Torvalds 已提交
661

662
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
663

L
Linus Torvalds 已提交
664 665 666 667 668 669 670 671 672 673 674
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

S
Stephen Hemminger 已提交
675
	while (left > 0) {
L
Linus Torvalds 已提交
676 677 678 679
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
L
Lucas De Marchi 已提交
680
		/* IF: we are not sending up to and including the packet end
L
Linus Torvalds 已提交
681 682 683 684 685
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

686 687 688
		/* Allocate buffer */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (!skb2) {
L
Linus Torvalds 已提交
689 690 691 692 693 694 695 696 697 698 699
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
700
		skb_reset_network_header(skb2);
701
		skb2->transport_header = skb2->network_header + hlen;
L
Linus Torvalds 已提交
702 703 704 705 706 707 708 709 710 711 712 713 714

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

715
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
L
Linus Torvalds 已提交
716 717 718 719

		/*
		 *	Copy a block of the IP datagram.
		 */
720
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
L
Linus Torvalds 已提交
721 722 723 724 725 726
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
727
		iph = ip_hdr(skb2);
L
Linus Torvalds 已提交
728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

755
		err = output(sk, skb2);
L
Linus Torvalds 已提交
756 757
		if (err)
			goto fail;
758

P
Pavel Emelyanov 已提交
759
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
L
Linus Torvalds 已提交
760
	}
761
	consume_skb(skb);
P
Pavel Emelyanov 已提交
762
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
L
Linus Torvalds 已提交
763 764 765
	return err;

fail:
766
	kfree_skb(skb);
P
Pavel Emelyanov 已提交
767
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
768 769
	return err;
}
770
EXPORT_SYMBOL(ip_do_fragment);
771

L
Linus Torvalds 已提交
772 773 774
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
775
	struct msghdr *msg = from;
L
Linus Torvalds 已提交
776

777
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
778
		if (copy_from_iter(to, len, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
779 780
			return -EFAULT;
	} else {
781
		__wsum csum = 0;
782
		if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
L
Linus Torvalds 已提交
783 784 785 786 787
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
E
Eric Dumazet 已提交
788
EXPORT_SYMBOL(ip_generic_getfrag);
L
Linus Torvalds 已提交
789

790
/*
 * Checksum @copy bytes of @page starting at @offset.  The page is mapped
 * with kmap() for the duration of the computation.
 */
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr = kmap(page);
	__wsum csum = csum_partial(kaddr + offset, copy, 0);

	kunmap(page);
	return csum;
}

A
Adrian Bunk 已提交
801
static inline int ip_ufo_append_data(struct sock *sk,
802
			struct sk_buff_head *queue,
803 804 805
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
806
			int transhdrlen, int maxfraglen, unsigned int flags)
807 808 809 810 811 812 813 814
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
815 816
	skb = skb_peek_tail(queue);
	if (!skb) {
817 818 819 820
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

821
		if (!skb)
822 823 824 825 826 827
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
828
		skb_put(skb, fragheaderlen + transhdrlen);
829 830

		/* initialize network header pointer */
831
		skb_reset_network_header(skb);
832 833

		/* initialize protocol header pointer */
834
		skb->transport_header = skb->network_header + fragheaderlen;
835 836 837

		skb->csum = 0;

838
		__skb_queue_tail(queue, skb);
839 840
	} else if (skb_is_gso(skb)) {
		goto append;
841
	}
842

843 844 845 846 847 848
	skb->ip_summed = CHECKSUM_PARTIAL;
	/* specify the length of each IP datagram fragment */
	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;

append:
849 850
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
851 852
}

853 854 855
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
856
			    struct inet_cork *cork,
857
			    struct page_frag *pfrag,
858 859 860 861
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
L
Linus Torvalds 已提交
862 863 864 865
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

866
	struct ip_options *opt = cork->opt;
L
Linus Torvalds 已提交
867 868 869 870 871 872
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
873
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
L
Linus Torvalds 已提交
874
	int csummode = CHECKSUM_NONE;
875
	struct rtable *rt = (struct rtable *)cork->dst;
876
	u32 tskey = 0;
L
Linus Torvalds 已提交
877

878 879 880
	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
881
	mtu = cork->fragsize;
882 883 884
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
L
Linus Torvalds 已提交
885

886
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
L
Linus Torvalds 已提交
887 888 889

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
W
WANG Cong 已提交
890
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
L
Linus Torvalds 已提交
891

892
	if (cork->length + length > maxnonfragsize - fragheaderlen) {
893
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
894
			       mtu - (opt ? opt->optlen : 0));
L
Linus Torvalds 已提交
895 896 897 898 899 900 901 902 903
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
904
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
L
Linus Torvalds 已提交
905
	    !exthdrlen)
906
		csummode = CHECKSUM_PARTIAL;
L
Linus Torvalds 已提交
907

908
	cork->length += length;
909
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
910
	    (sk->sk_protocol == IPPROTO_UDP) &&
911 912
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM)) {
913 914
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
915
					 maxfraglen, flags);
916
		if (err)
917 918 919
			goto error;
		return 0;
	}
L
Linus Torvalds 已提交
920 921 922 923 924 925 926 927

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

928
	if (!skb)
L
Linus Torvalds 已提交
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

959
			if ((flags & MSG_MORE) &&
960
			    !(rt->dst.dev->features&NETIF_F_SG))
L
Linus Torvalds 已提交
961 962
				alloclen = mtu;
			else
963
				alloclen = fraglen;
L
Linus Torvalds 已提交
964

965 966
			alloclen += exthdrlen;

L
Linus Torvalds 已提交
967 968 969 970 971
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
972
			if (datalen == length + fraggap)
973
				alloclen += rt->dst.trailer_len;
974

L
Linus Torvalds 已提交
975
			if (transhdrlen) {
976
				skb = sock_alloc_send_skb(sk,
L
Linus Torvalds 已提交
977 978 979 980 981 982
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
983
					skb = sock_wmalloc(sk,
L
Linus Torvalds 已提交
984 985
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
986
				if (unlikely(!skb))
L
Linus Torvalds 已提交
987 988
					err = -ENOBUFS;
			}
989
			if (!skb)
L
Linus Torvalds 已提交
990 991 992 993 994 995 996 997
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
998 999

			/* only the initial fragment is time stamped */
1000
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1001
			cork->tx_flags = 0;
1002 1003
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
L
Linus Torvalds 已提交
1004 1005 1006 1007

			/*
			 *	Find where to start putting bytes.
			 */
1008
			data = skb_put(skb, fraglen + exthdrlen);
1009
			skb_set_network_header(skb, exthdrlen);
1010 1011
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
1012
			data += fragheaderlen + exthdrlen;
L
Linus Torvalds 已提交
1013 1014 1015 1016 1017 1018 1019 1020

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
1021
				pskb_trim_unique(skb_prev, maxfraglen);
L
Linus Torvalds 已提交
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
1040
			__skb_queue_tail(queue, skb);
L
Linus Torvalds 已提交
1041 1042 1043 1044 1045 1046
			continue;
		}

		if (copy > length)
			copy = length;

1047
		if (!(rt->dst.dev->features&NETIF_F_SG)) {
L
Linus Torvalds 已提交
1048 1049 1050
			unsigned int off;

			off = skb->len;
1051
			if (getfrag(from, skb_put(skb, copy),
L
Linus Torvalds 已提交
1052 1053 1054 1055 1056 1057 1058 1059
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

1060 1061
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
L
Linus Torvalds 已提交
1062
				goto error;
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
L
Linus Torvalds 已提交
1074
			}
1075 1076 1077 1078 1079 1080 1081 1082
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
L
Linus Torvalds 已提交
1083 1084
			skb->len += copy;
			skb->data_len += copy;
1085 1086
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
L
Linus Torvalds 已提交
1087 1088 1089 1090 1091 1092 1093
		}
		offset += copy;
		length -= copy;
	}

	return 0;

1094 1095
error_efault:
	err = -EFAULT;
L
Linus Torvalds 已提交
1096
error:
1097
	cork->length -= length;
P
Pavel Emelyanov 已提交
1098
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1099
	return err;
L
Linus Torvalds 已提交
1100 1101
}

1102 1103 1104
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
1105
	struct ip_options_rcu *opt;
1106 1107 1108 1109 1110 1111 1112
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
1113
		if (!cork->opt) {
1114 1115
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
1116
			if (unlikely(!cork->opt))
1117 1118
				return -ENOBUFS;
		}
1119
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1120 1121 1122 1123 1124 1125 1126 1127 1128 1129
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
1130 1131
	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1132 1133
	cork->dst = &rt->dst;
	cork->length = 0;
1134 1135 1136
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152
	cork->tx_flags = ipc->tx_flags;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP; other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
1153
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
1167
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1168 1169 1170 1171 1172 1173
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

1174 1175
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
1176 1177 1178
				from, length, transhdrlen, flags);
}

1179
/*
 *	Append @size bytes of @page, starting at @offset, to the datagram
 *	already pending on the socket's write queue.
 *
 *	Returns 0 on success (also for MSG_PROBE) or a negative errno:
 *	-EPERM for hdrincl sockets, -EINVAL when nothing is pending,
 *	-EOPNOTSUPP when the output device lacks NETIF_F_SG, -EMSGSIZE when
 *	the datagram would grow too large or the page cannot be attached,
 *	-ENOBUFS when a new fragment skb cannot be allocated.
 */
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_cork *cork = &inet->cork.base;
	struct ip_options *opt = NULL;
	struct sk_buff *skb;
	struct rtable *rt;
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
	int optlen;
	int hh_len;
	int mtu;
	int len;
	int err;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	/* pages can only extend a datagram that was already started */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;
	optlen = opt ? opt->optlen : 0;

	fragheaderlen = sizeof(struct iphdr) + optlen;
	/* fragment payloads are kept a multiple of 8 bytes */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + size > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - optlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
		return -EINVAL;

	cork->length += size;

	/* too big for one frame on a UFO capable device: mark the skb GSO */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		if (skb_is_gso(skb)) {
			/* a GSO skb takes everything that is left */
			len = size;
		} else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}

		if (len <= 0) {
			/* current skb is full: start a new fragment */
			struct sk_buff *prev = skb;
			int alloc_len;

			/* bytes of @prev that spill past the fragment limit */
			fraggap = prev->len - maxfraglen;

			alloc_len = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloc_len, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);

			if (fraggap) {
				/*
				 * Move the spill-over from @prev into the new
				 * skb and fix up both software checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(prev,
						maxfraglen,
						skb_transport_header(skb),
						fraggap, 0);
				prev->csum = csum_sub(prev->csum, skb->csum);
				pskb_trim_unique(prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;

		if (skb_append_pagefrags(skb, page, offset, len)) {
			err = -EMSGSIZE;
			goto error;
		}

		/* software checksum: fold in the bytes just attached */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum page_csum = csum_page(page, offset, len);

			skb->csum = csum_block_add(skb->csum, page_csum,
						   skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	/* give back the part of this request that was never appended */
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

1319
static void ip_cork_release(struct inet_cork *cork)
1320
{
1321 1322 1323 1324 1325
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
1326 1327
}

L
Linus Torvalds 已提交
1328 1329 1330 1331
/*
 *	Combined all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
1332
struct sk_buff *__ip_make_skb(struct sock *sk,
1333
			      struct flowi4 *fl4,
1334 1335
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
L
Linus Torvalds 已提交
1336 1337 1338 1339
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
1340
	struct net *net = sock_net(sk);
L
Linus Torvalds 已提交
1341
	struct ip_options *opt = NULL;
1342
	struct rtable *rt = (struct rtable *)cork->dst;
L
Linus Torvalds 已提交
1343
	struct iphdr *iph;
1344
	__be16 df = 0;
L
Linus Torvalds 已提交
1345 1346
	__u8 ttl;

1347 1348
	skb = __skb_dequeue(queue);
	if (!skb)
L
Linus Torvalds 已提交
1349 1350 1351 1352
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
1353
	if (skb->data < skb_network_header(skb))
1354
		__skb_pull(skb, skb_network_offset(skb));
1355
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1356
		__skb_pull(tmp_skb, skb_network_header_len(skb));
L
Linus Torvalds 已提交
1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter, what transforms
	 * how transforms change size of the packet, it will come out.
	 */
W
WANG Cong 已提交
1370
	skb->ignore_df = ip_sk_ignore_df(sk);
L
Linus Torvalds 已提交
1371 1372

	/* DF bit is set when we want to see DF on outgoing frames.
W
WANG Cong 已提交
1373
	 * If ignore_df is set too, we still allow to fragment this frame
L
Linus Torvalds 已提交
1374
	 * locally. */
1375 1376
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
1377 1378
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
L
Linus Torvalds 已提交
1379 1380
		df = htons(IP_DF);

1381 1382
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;
L
Linus Torvalds 已提交
1383

1384 1385 1386
	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
L
Linus Torvalds 已提交
1387 1388
		ttl = inet->mc_ttl;
	else
1389
		ttl = ip_select_ttl(inet, &rt->dst);
L
Linus Torvalds 已提交
1390

1391
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1392 1393
	iph->version = 4;
	iph->ihl = 5;
1394
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
L
Linus Torvalds 已提交
1395 1396 1397
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
1398
	ip_copy_addrs(iph, fl4);
1399
	ip_select_ident(net, skb, sk);
L
Linus Torvalds 已提交
1400

1401 1402 1403 1404 1405
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

1406
	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1407
	skb->mark = sk->sk_mark;
1408 1409 1410 1411
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
1412
	cork->dst = NULL;
1413
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
1414

1415
	if (iph->protocol == IPPROTO_ICMP)
1416
		icmp_out_count(net, ((struct icmphdr *)
1417 1418
			skb_transport_header(skb))->type);

1419 1420 1421 1422 1423
	ip_cork_release(cork);
out:
	return skb;
}

E
Eric Dumazet 已提交
1424
int ip_send_skb(struct net *net, struct sk_buff *skb)
1425 1426 1427
{
	int err;

H
Herbert Xu 已提交
1428
	err = ip_local_out(skb);
L
Linus Torvalds 已提交
1429 1430
	if (err) {
		if (err > 0)
E
Eric Dumazet 已提交
1431
			err = net_xmit_errno(err);
L
Linus Torvalds 已提交
1432
		if (err)
1433
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
L
Linus Torvalds 已提交
1434 1435 1436 1437 1438
	}

	return err;
}

1439
/*
 *	Build the pending datagram with ip_finish_skb() and send it.
 *	Returns 0 when there was nothing to send, otherwise the result of
 *	ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented, skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}

L
Linus Torvalds 已提交
1451 1452 1453
/*
 *	Throw away all pending data on the socket.
 */
1454 1455 1456
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
L
Linus Torvalds 已提交
1457 1458 1459
{
	struct sk_buff *skb;

1460
	while ((skb = __skb_dequeue_tail(queue)) != NULL)
L
Linus Torvalds 已提交
1461 1462
		kfree_skb(skb);

1463 1464 1465 1466 1467
	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
1468
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
L
Linus Torvalds 已提交
1469 1470
}

1471
struct sk_buff *ip_make_skb(struct sock *sk,
1472
			    struct flowi4 *fl4,
1473 1474 1475 1476 1477 1478
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
1479
	struct inet_cork cork;
1480 1481 1482 1483 1484 1485 1486 1487
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

1488 1489
	cork.flags = 0;
	cork.addr = 0;
1490
	cork.opt = NULL;
1491 1492 1493 1494
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

1495 1496
	err = __ip_append_data(sk, fl4, &queue, &cork,
			       &current->task_frag, getfrag,
1497 1498 1499 1500 1501 1502
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

1503
	return __ip_make_skb(sk, fl4, &queue, &cork);
1504
}
L
Linus Torvalds 已提交
1505 1506 1507 1508

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
1509
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
L
Linus Torvalds 已提交
1510 1511
			      int len, int odd, struct sk_buff *skb)
{
1512
	__wsum csum;
L
Linus Torvalds 已提交
1513 1514 1515

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
1516
	return 0;
L
Linus Torvalds 已提交
1517 1518
}

1519
/*
L
Linus Torvalds 已提交
1520
 *	Generic function to send a packet as reply to another packet.
1521
 *	Used to send some TCP resets/acks so far.
L
Linus Torvalds 已提交
1522
 */
1523
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1524 1525 1526
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
1527
			   unsigned int len)
L
Linus Torvalds 已提交
1528
{
1529
	struct ip_options_data replyopts;
L
Linus Torvalds 已提交
1530
	struct ipcm_cookie ipc;
1531
	struct flowi4 fl4;
E
Eric Dumazet 已提交
1532
	struct rtable *rt = skb_rtable(skb);
1533
	struct net *net = sock_net(sk);
1534
	struct sk_buff *nskb;
1535
	int err;
L
Linus Torvalds 已提交
1536

1537
	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
L
Linus Torvalds 已提交
1538 1539
		return;

1540
	ipc.addr = daddr;
L
Linus Torvalds 已提交
1541
	ipc.opt = NULL;
1542
	ipc.tx_flags = 0;
1543 1544
	ipc.ttl = 0;
	ipc.tos = -1;
L
Linus Torvalds 已提交
1545

1546
	if (replyopts.opt.opt.optlen) {
L
Linus Torvalds 已提交
1547 1548
		ipc.opt = &replyopts.opt;

1549 1550
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
L
Linus Torvalds 已提交
1551 1552
	}

1553 1554
	flowi4_init_output(&fl4, arg->bound_dev_if,
			   IP4_REPLY_MARK(net, skb->mark),
1555
			   RT_TOS(arg->tos),
1556
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1557
			   ip_reply_arg_flowi_flags(arg),
1558
			   daddr, saddr,
1559 1560
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1561
	rt = ip_route_output_key(net, &fl4);
1562 1563
	if (IS_ERR(rt))
		return;
L
Linus Torvalds 已提交
1564

1565
	inet_sk(sk)->tos = arg->tos;
L
Linus Torvalds 已提交
1566 1567

	sk->sk_priority = skb->priority;
1568
	sk->sk_protocol = ip_hdr(skb)->protocol;
1569
	sk->sk_bound_dev_if = arg->bound_dev_if;
1570
	sk->sk_sndbuf = sysctl_wmem_default;
1571 1572 1573 1574 1575 1576 1577
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

1578 1579
	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
L
Linus Torvalds 已提交
1580
		if (arg->csumoffset >= 0)
1581 1582
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1583
								arg->csum));
1584 1585
		nskb->ip_summed = CHECKSUM_NONE;
		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1586
		ip_push_pending_frames(sk, &fl4);
L
Linus Torvalds 已提交
1587
	}
1588
out:
L
Linus Torvalds 已提交
1589 1590 1591 1592 1593 1594 1595 1596
	ip_rt_put(rt);
}

/*
 *	Boot-time IP initialisation: routing, the inet peer storage and,
 *	when CONFIG_IP_MULTICAST is enabled, IGMP.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
	igmp_mc_init();
#endif
}