/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *						     udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian  La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff    :       Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

/*
 * Wait for a packet..
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk->sk_sleep, &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}

/**
 *	__skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@peeked: returns non-zero if this packet has been seen before
 *	@err: error code returned
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as IPX, AX.25 and AppleTalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	This function will lock the socket if a skb is returned, so the caller
 *	needs to unlock the socket in that case (usually by calling
 *	skb_free_datagram)
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	*  8) Great win.)
 *	*			                    --ANK (980729)
 *
 *	The order of the tests when we find no data waiting are specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
				    int *peeked, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		unsigned long cpu_flags;

		spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb) {
			*peeked = skb->peeked;
			if (flags & MSG_PEEK) {
				skb->peeked = 1;
				atomic_inc(&skb->users);
			} else
				__skb_unlink(skb, &sk->sk_receive_queue);
		}
		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_packet(sk, err, &timeo));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
				  int noblock, int *err)
{
	int peeked;

	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				   &peeked, err);
}
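
/*
 * Illustrative sketch only (not part of the original file, kept out of
 * the build with #if 0): roughly how a protocol's recvmsg() might use
 * skb_recv_datagram() and skb_free_datagram().  Modelled loosely on the
 * udp/raw callers; the function name and error handling are assumptions.
 */
#if 0
static int example_recvmsg(struct sock *sk, struct msghdr *msg,
			   size_t len, int noblock, int flags)
{
	struct sk_buff *skb;
	int err, copied;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		return err;

	/* Truncate to the caller's buffer, flagging lost bytes. */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	skb_free_datagram(sk, skb);
	return err ? err : copied;
}
#endif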

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
	sk_mem_reclaim_partial(sk);
}

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram.  The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock.  Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 *
 *	It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
			err = 0;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	skb_free_datagram(sk, skb);
	return err;
}

EXPORT_SYMBOL(skb_kill_datagram);
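
/*
 * Illustrative sketch only (not compiled): the usual pairing of MSG_PEEK
 * receives with skb_kill_datagram(), as done by e.g. the UDP receive
 * path when a peeked datagram turns out to have a bad checksum.  The
 * function name is an assumption for the example.
 */
#if 0
static int example_drop_bad_datagram(struct sock *sk, struct sk_buff *skb,
				     unsigned int flags)
{
	if (__skb_checksum_complete(skb)) {
		/* Checksum failed: unlink and free the skb (honouring
		 * MSG_PEEK semantics), then let the caller retry.
		 */
		skb_kill_datagram(sk, skb, flags);
		return -EAGAIN;
	}
	return 0;
}
#endif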

/**
 *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			WARN_ON(start > offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_iovec(list,
							    offset - start,
							    to, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
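
/*
 * Illustrative sketch only (not compiled): copying a datagram into a
 * single user-space buffer by wrapping it in a one-element iovec.  'buf'
 * and 'buflen' are assumed parameters for the example.
 */
#if 0
static int example_copy_to_user_buf(const struct sk_buff *skb,
				    void __user *buf, int buflen)
{
	struct iovec iov = {
		.iov_base = buf,
		.iov_len  = buflen,
	};

	/* The iovec is advanced in place as data is copied out. */
	return skb_copy_datagram_iovec(skb, 0, &iov,
				       min_t(int, skb->len, buflen));
}
#endif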

/**
 *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying to
 *	@from: io vector to copy from
 *	@len: amount of data to copy to buffer from iovec
 *
 *	Returns 0 or -EFAULT.
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
				 struct iovec *from, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_fromiovec(skb->data + offset, from, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_fromiovec(vaddr + frag->page_offset +
					       offset - start, from, copy);
			kunmap(page);
			if (err)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			WARN_ON(start > offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_from_iovec(list,
								 offset - start,
								 from, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
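
/*
 * Illustrative sketch only (not compiled): the sendmsg()-side mirror of
 * the above, filling a freshly allocated linear skb from an iovec.  The
 * function name and the simplified sizing are assumptions.
 */
#if 0
static struct sk_buff *example_skb_from_iovec(struct sock *sk,
					      struct iovec *iov, int len,
					      int *errp)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len, 0, errp);
	if (!skb)
		return NULL;

	skb_put(skb, len);
	*errp = skb_copy_datagram_from_iovec(skb, 0, iov, len);
	if (*errp) {
		kfree_skb(skb);
		return NULL;
	}
	return skb;
}
#endif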

static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int pos = 0;
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			int err = 0;
			u8  *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
							frag->page_offset +
							offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			WARN_ON(start > offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				__wsum csum2 = 0;
				if (copy > len)
					copy = len;
				if (skb_copy_and_csum_datagram(list,
							       offset - start,
							       to, copy,
							       &csum2))
					goto fault;
				*csump = csum_block_add(*csump, csum2, pos);
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
				to += copy;
				pos += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
	__sum16 sum;

	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
	return __skb_checksum_complete_head(skb, skb->len);
}
EXPORT_SYMBOL(__skb_checksum_complete);
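
/*
 * Illustrative sketch only (not compiled): the typical checksum check a
 * receive path performs before trusting payload data.  Hardware-verified
 * packets (CHECKSUM_UNNECESSARY) skip the software fold.
 */
#if 0
static inline int example_csum_ok(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return 1;
	return __skb_checksum_complete(skb) == 0;
}
#endif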

/**
 *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: hardware length
 *	@iov: io vector
 *
 *	Caller _must_ check that skb will fit to this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if (csum_fold(csum))
			goto csum_error;
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}

/**
 * 	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	poll_wait(file, sk->sk_sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
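
/*
 * Illustrative sketch only (not compiled): datagram_poll() is normally
 * wired directly into a protocol's proto_ops table; the family value and
 * omitted handlers here are placeholders.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
	.family	= PF_UNSPEC,	/* placeholder */
	.owner	= THIS_MODULE,
	.poll	= datagram_poll,
	/* ... remaining handlers omitted ... */
};
#endif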

EXPORT_SYMBOL(datagram_poll);
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
EXPORT_SYMBOL(skb_copy_datagram_iovec);
EXPORT_SYMBOL(skb_free_datagram);
EXPORT_SYMBOL(skb_recv_datagram);