/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/sched/rt.h>
#include <net/ll_poll.h>

#include <asm/uaccess.h>

/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */
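/*
 * Worked example: with the default 0.1% slack a 2 s timeout yields
 * 2 ms of slack and anything past 100 s saturates at MAX_SLACK;
 * a "nice" task gets 10 ms of slack for the same 2 s timeout.
 */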

#define MAX_SLACK	(100 * NSEC_PER_MSEC)

static long __estimate_accuracy(struct timespec *tv)
{
	long slack;
	int divfactor = 1000;

	if (tv->tv_sec < 0)
		return 0;

	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}

long select_estimate_accuracy(struct timespec *tv)
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

	if (rt_task(current))
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



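/*
 * Book-keeping for the wait-queue entries one select()/poll() call has
 * registered: a chain of pages, each packed with as many
 * poll_table_entry slots as fit.  POLL_TABLE_FULL() below detects when
 * the current page is exhausted.
 */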
struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[0];
};

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);
}

void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
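	/*
	 * Hand out wait-queue entries: first from the small inline array
	 * embedded in struct poll_wqueues, then from dynamically
	 * allocated pages chained through ->table.
	 */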
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	set_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note: we do not use a timespec for the user space value here; that
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts(to);
		*to = timespec_add_safe(*to, ts);
	}
	return 0;
}
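/*
 * For example, poll(2)'s relative millisecond timeout is turned into an
 * absolute expiry further down in this file with:
 *
 *	poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
 *		NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
 */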

static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	ktime_get_ts(&rts);
	rts = timespec_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;

	if (timeval) {
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts.tv_sec;
		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!copy_to_user(p, &rts, sizeof(rts)))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
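	/*
	 * Scan the three requested bitmaps in one pass: return -EBADF if
	 * any requested fd is not actually open, otherwise one more than
	 * the highest fd being watched.
	 */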
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle the last incomplete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		if (max)
			continue;
get_max:
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)
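/*
 * POLLHUP and POLLERR are deliberately folded into the "read" set and
 * POLLERR into the "write" set: select() reports hangups and errors as
 * readable/writable, while exceptfds only ever sees POLLPRI.
 * wait_key_set() below primes poll_table->_key with the subset of these
 * events actually being watched for a descriptor, so pollwake() can
 * discard wakeups for events nobody asked about.
 */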

static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				unsigned int ll_flag)
{
	wait->_key = POLLEX_SET | ll_flag;
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
}

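/*
 * do_select() expects fds->in/out/ex to hold the bitmaps already copied
 * in from userspace and reports ready descriptors through
 * fds->res_in/res_out/res_ex.  The return value is the number of
 * (fd, set) memberships that fired, 0 on timeout, or a negative errno.
 */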
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	unsigned long slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	u64 busy_start = busy_loop_start_time(busy_flag);
	u64 busy_end = busy_loop_end_time();

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll) {
						wait_key_set(wait, in, out,
							     bit, busy_flag);
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					fdput(f);
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (!need_resched() && can_busy_loop &&
		    busy_loop_range(busy_start, busy_end))
			continue;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND, which restarts only when you want it to.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	unsigned int size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing);
	 * since we use fd_sets, we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
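	/* Carve the six bitmaps out of the single allocation, back to back: */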
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
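/*
 * Conceptually, userspace passes:
 *
 *	struct {
 *		const sigset_t *ss;
 *		size_t ss_len;
 *	} *sig;
 *
 * where ss may be NULL and ss_len must equal sizeof(sigset_t) whenever
 * ss is not.
 */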
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
		    || __get_user(up, (sigset_t __user * __user *)sig)
		    || __get_user(sigsetsize,
				(size_t __user *)(sig+sizeof(void *))))
			return -EFAULT;
	}

	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
}

#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[0];
};

#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     unsigned int busy_flag)
{
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
		struct fd f = fdget(fd);
		mask = POLLNVAL;
		if (f.file) {
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op && f.file->f_op->poll) {
				pwait->_key = pollfd->events|POLLERR|POLLHUP;
				pwait->_key |= busy_flag;
				mask = f.file->f_op->poll(f.file, pwait);
				if (mask & busy_flag)
					*can_busy_poll = true;
			}
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fdput(f);
		}
	}
	pollfd->revents = mask;

	return mask;
}

static int do_poll(unsigned int nfds,  struct poll_list *list,
		   struct poll_wqueues *wait, struct timespec *end_time)
{
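	/*
	 * Each iteration rescans every pollfd; do_pollfd() stores the
	 * ready mask in ->revents as a side effect.  Returns the number
	 * of descriptors with events or errors reported, 0 on timeout,
	 * or -EINTR if a signal is pending.
	 */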
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	unsigned long slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	u64 busy_start = busy_loop_start_time(busy_flag);
	u64 busy_end = busy_loop_end_time();

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (!need_resched() && can_busy_loop &&
		    busy_loop_range(busy_start, busy_end))
			continue;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))
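/*
 * With the 256-byte POLL_STACK_ALLOC this works out to roughly 30
 * pollfds (on 64-bit) served straight from the stack; larger arrays
 * spill into kmalloc'd poll_list chunks of up to POLLFD_PER_PAGE
 * entries each.
 */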

int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(nfds, head, &table, end_time);
	poll_freewait(&table);

	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}

static long do_restart_poll(struct restart_block *restart_block)
{
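	/*
	 * Reached via sys_restart_syscall() after a signal interrupted
	 * poll(): the original arguments, including the absolute expiry
	 * time, were stashed in the restart block, so the remaining
	 * timeout is honoured across the restart.
	 */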
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec *to = NULL, end_time;
	int ret;

	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}