datagram.c 21.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11
/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
12
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
L
Linus Torvalds 已提交
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 *						     udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian  La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff    :       Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
49
#include <linux/spinlock.h>
50
#include <linux/slab.h>
51
#include <linux/pagemap.h>
L
Linus Torvalds 已提交
52 53 54 55

#include <net/protocol.h>
#include <linux/skbuff.h>

56 57 58
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
59
#include <trace/events/skb.h>
60
#include <net/busy_poll.h>
L
Linus Torvalds 已提交
61 62 63 64 65 66 67 68 69

/*
 *	Does this socket type carry a connection (SEQPACKET or STREAM)?
 */
static inline int connection_based(struct sock *sk)
{
	switch (sk->sk_type) {
	case SOCK_SEQPACKET:
	case SOCK_STREAM:
		return 1;
	default:
		return 0;
	}
}

70
static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
71 72 73 74 75 76 77 78 79 80 81
				  void *key)
{
	unsigned long bits = (unsigned long)key;

	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (bits && !(bits & (POLLIN | POLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}
/*
 * Wait for the last received packet to be different from skb, i.e. wait
 * until a new packet arrives at the tail of sk's receive queue.
 *
 * Returns 0 when the caller should rescan the queue, 1 (with *err = 0)
 * when the socket was shut down for receive, or a negative value (also
 * stored in *err) on socket error / disconnect / pending signal.
 * *timeo_p is updated with the remaining timeout after sleeping.
 */
static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
				 const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	/* Register on the wait queue BEFORE re-checking the conditions
	 * below, so a wakeup racing with the checks cannot be lost.
	 */
	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	/* A packet arrived after the caller last looked: no need to sleep. */
	if (sk->sk_receive_queue.prev != skb)
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	/* Shutdown: report "success, but no more data will come". */
	*err = 0;
	error = 1;
	goto out;
}

/**
 *	__skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@peeked: returns non-zero if this packet has been seen before
 *	@off: an offset in bytes to peek skb from. Returns an offset
 *	      within an skb where data actually starts
 *	@err: error code returned
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	This function will lock the socket if a skb is returned, so the caller
 *	needs to unlock the socket in that case (usually by calling
 *	skb_free_datagram)
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	*  8) Great win.)
 *	*			                    --ANK (980729)
 *
 *	The order of the tests when we find no data waiting are specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
				    int *peeked, int *off, int *err)
{
	struct sk_buff *skb, *last;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		unsigned long cpu_flags;
		struct sk_buff_head *queue = &sk->sk_receive_queue;
		/* Work on a local copy of the peek offset; it is only
		 * written back once an skb has actually been selected.
		 */
		int _off = *off;

		/* "last" starts as the queue head sentinel so that
		 * wait_for_more_packets() can detect any new arrival.
		 */
		last = (struct sk_buff *)queue;
		spin_lock_irqsave(&queue->lock, cpu_flags);
		skb_queue_walk(queue, skb) {
			last = skb;
			*peeked = skb->peeked;
			if (flags & MSG_PEEK) {
				/* Skip skbs that lie entirely before the
				 * requested peek offset.  A fresh zero-length
				 * skb at offset 0 is NOT skipped, so empty
				 * datagrams are still delivered once.
				 */
				if (_off >= skb->len && (skb->len || _off ||
							 skb->peeked)) {
					_off -= skb->len;
					continue;
				}
				skb->peeked = 1;
				/* Peeking leaves the skb queued; take an
				 * extra reference so the caller can use it.
				 */
				atomic_inc(&skb->users);
			} else
				__skb_unlink(skb, queue);

			spin_unlock_irqrestore(&queue->lock, cpu_flags);
			*off = _off;
			return skb;
		}
		spin_unlock_irqrestore(&queue->lock, cpu_flags);

		/* Busy-poll the device before sleeping, if enabled. */
		if (sk_can_busy_loop(sk) &&
		    sk_busy_loop(sk, flags & MSG_DONTWAIT))
			continue;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_more_packets(sk, err, &timeo, last));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

231
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
232 233
				  int noblock, int *err)
{
234
	int peeked, off = 0;
235 236

	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
237
				   &peeked, &off, err);
238
}
E
Eric Dumazet 已提交
239
EXPORT_SYMBOL(skb_recv_datagram);
L
Linus Torvalds 已提交
240 241 242

/*
 *	Release a datagram skbuff obtained from skb_recv_datagram() and give
 *	any partially-charged receive memory back to the protocol accounting.
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
	sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

/*
 *	Free a received datagram, taking the socket lock only when this is
 *	the last reference (the common fast path avoids the lock entirely).
 */
void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
	bool slow;

	/* Fast path: we hold the only reference, so no atomic dec is
	 * needed; the read barrier pairs with the refcount protocol
	 * before the skb is torn down below.
	 */
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;	/* someone else still holds a reference */

	/* Last reference: orphan and reclaim under the socket lock... */
	slow = lock_sock_fast(sk);
	skb_orphan(skb);
	sk_mem_reclaim_partial(sk);
	unlock_sock_fast(sk, slow);

	/* skb is now orphaned, can be freed outside of locked section */
	__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);
L
Linus Torvalds 已提交
266

267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram.  The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock.  Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 *
 *	It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = 0;

	if (flags & MSG_PEEK) {
		/* The skb was only peeked, so it may still be queued.
		 * -ENOENT means someone else already consumed it.
		 */
		err = -ENOENT;
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			/* Drop the queue's reference; the caller's peek
			 * reference is released by kfree_skb() below.
			 */
			atomic_dec(&skb->users);
			err = 0;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
	atomic_inc(&sk->sk_drops);
	sk_mem_reclaim_partial(sk);

	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);

L
Linus Torvalds 已提交
311 312
/**
 *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Returns 0 on success or -EFAULT on a userspace copy failure.
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	/* "start"/"end" track the byte range each skb section covers so
	 * that "offset" can be located within the right section.
	 */
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			/* Frag pages may be highmem: map before copying. */
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	/* Finally, recurse into the frag list of chained skbs. */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iovec(frag_iter,
						    offset - start,
						    to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_iovec);
L
Linus Torvalds 已提交
394

395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
/**
 *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@to_offset: offset in the io vector to start copying to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Returns 0 or -EFAULT.
 *	Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
				  const struct iovec *to, int to_offset,
				  int len)
{
	/* Same section-walking scheme as skb_copy_datagram_iovec(), but
	 * the destination position is tracked via to_offset instead of
	 * advancing the iovec itself.
	 */
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			/* Frag pages may be highmem: map before copying. */
			vaddr = kmap(page);
			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
						offset - start, to_offset, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}

	/* Finally, recurse into the frag list of chained skbs. */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_const_iovec(frag_iter,
							  offset - start,
							  to, to_offset,
							  copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);

484 485 486 487 488
/**
 *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying to
 *	@from: io vector to copy to
 *	@from_offset: offset in the io vector to start copying from
 *	@len: amount of data to copy to buffer from iovec
 *
 *	Returns 0 or -EFAULT.
 *	Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
				 const struct iovec *from, int from_offset,
				 int len)
{
	/* Mirror of skb_copy_datagram_const_iovec(), with the copy
	 * direction reversed: user iovec -> skb sections.
	 */
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
					copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			/* Frag pages may be highmem: map before copying. */
			vaddr = kmap(page);
			err = memcpy_fromiovecend(vaddr + frag->page_offset +
						  offset - start,
						  from, from_offset, copy);
			kunmap(page);
			if (err)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}

	/* Finally, recurse into the frag list of chained skbs. */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iovec(frag_iter,
							 offset - start,
							 from,
							 from_offset,
							 copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);

577 578 579
/**
 *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
 *	@skb: buffer to copy
 *	@from: io vector to copy from
 *	@offset: offset in the io vector to start copying from
 *	@count: amount of vectors to copy to buffer from
 *
 *	The function will first copy up to headlen, and then pin the userspace
 *	pages and build frags through them.
 *
 *	Returns 0, -EFAULT or -EMSGSIZE.
 *	Note: the iovec is not modified during the copy
 */
int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	/* The linear (headlen) part is copied conventionally ... */
	int copy = min_t(int, skb_headlen(skb), len);
	int size;
	int i = 0;	/* next free frag slot */

	/* copy up to skb headlen */
	if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
		return -EFAULT;

	if (len == copy)
		return 0;

	/* ... the remainder is mapped zero-copy by pinning user pages. */
	offset += copy;
	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		/* Skip over from offset and copied */
		if (offset >= from->iov_len) {
			offset -= from->iov_len;
			++from;
			continue;
		}
		len = from->iov_len - offset;
		base = (unsigned long)from->iov_base + offset;
		/* Number of pages touched by [base, base+len). */
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			/* Partial pin: drop this batch only.  Pages from
			 * earlier iterations already sit in skb frags and
			 * are presumably released with the skb by the
			 * caller -- NOTE(review): confirm against callers.
			 */
			release_pages(&page[i], num_pages, 0);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		/* Charge the pinned pages to the sending socket. */
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			int size = min_t(int, len, PAGE_SIZE - off);
			skb_fill_page_desc(skb, i, page[i], off, size);
			base += size;
			len -= size;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}
EXPORT_SYMBOL(zerocopy_sg_from_iovec);

L
Linus Torvalds 已提交
648 649
/*
 * Copy @len bytes of @skb (starting at @offset) to userspace @to while
 * accumulating the Internet checksum of the copied data into *@csump.
 * Returns 0 on success or -EFAULT on a userspace copy failure.
 */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	/* pos tracks how many bytes have been copied so far; partial
	 * checksums must be folded in at the right byte position.
	 */
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			int err = 0;
			u8  *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			/* Frag pages may be highmem: map before copying. */
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
							frag->page_offset +
							offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			/* Fold the fragment checksum in at byte offset pos. */
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	/* Finally, recurse into the frag list of chained skbs. */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2 = 0;
			if (copy > len)
				copy = len;
			if (skb_copy_and_csum_datagram(frag_iter,
						       offset - start,
						       to, copy,
						       &csum2))
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

737
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
738
{
739
	__sum16 sum;
740

741
	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
742 743 744 745 746 747
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}
	skb->csum_valid = !sum;
748 749
	return sum;
}
750 751 752 753
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773
	__wsum csum;
	__sum16 sum;

	csum = skb_checksum(skb, 0, skb->len, 0);

	/* skb->csum holds pseudo checksum */
	sum = csum_fold(csum_add(skb->csum, csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	/* Save full packet checksum */
	skb->csum = csum;
	skb->ip_summed = CHECKSUM_COMPLETE;
	skb->csum_complete_sw = 1;
	skb->csum_valid = !sum;

	return sum;
774
}
775 776
EXPORT_SYMBOL(__skb_checksum_complete);

L
Linus Torvalds 已提交
777 778
/**
 *	skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: hardware length
 *	@iov: io vector
 *
 *	Caller _must_ check that skb will fit to this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		/* Payload spans several iovec elements: verify the whole
		 * checksum first, then copy without re-checksumming.
		 */
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		/* Single-element destination: checksum while copying. */
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if (csum_fold(csum))
			goto csum_error;
		/* Hardware claimed a good checksum but software disagreed:
		 * report the device.
		 */
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
L
Linus Torvalds 已提交
829 830 831

/**
 * 	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	/* Register on the socket's wait queue BEFORE sampling state
	 * below, so state changes racing with this poll are not missed.
	 */
	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
EXPORT_SYMBOL(datagram_poll);