select.c 23.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

17
#include <linux/kernel.h>
L
Linus Torvalds 已提交
18 19 20 21 22 23
#include <linux/syscalls.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
A
Al Viro 已提交
24
#include <linux/fdtable.h>
L
Linus Torvalds 已提交
25
#include <linux/fs.h>
26
#include <linux/rcupdate.h>
27
#include <linux/hrtimer.h>
L
Linus Torvalds 已提交
28 29 30

#include <asm/uaccess.h>

31 32 33 34 35 36 37 38 39 40 41 42 43

/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

44
static long __estimate_accuracy(struct timespec *tv)
45
{
46
	long slack;
47 48
	int divfactor = 1000;

49
	if (task_nice(current) > 0)
50 51 52 53 54 55 56
		divfactor = divfactor / 5;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > 100 * NSEC_PER_MSEC)
		slack =  100 * NSEC_PER_MSEC;
57 58 59

	if (slack < 0)
		slack = 0;
60 61 62
	return slack;
}

63
static long estimate_accuracy(struct timespec *tv)
64 65 66 67 68 69 70 71
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

72
	if (rt_task(current))
73 74 75 76 77 78 79 80 81 82 83 84
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
/*
 * One overflow page of poll table entries, used once the inline entries
 * of a poll_wqueues are exhausted.
 *
 * @next:    previously allocated page (pages form a singly linked list)
 * @entry:   next free slot inside @entries
 * @entries: entry storage filling the rest of the page (C99 flexible
 *           array member; contributes nothing to sizeof)
 */
struct poll_table_page {
	struct poll_table_page *next;
	struct poll_table_entry *entry;
	struct poll_table_entry entries[];
};

/*
 * True when one more entry after (table)->entry would extend past
 * PAGE_SIZE bytes from the start of @table, i.e. the page has no free slot.
 */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() make all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
A
Adrian Bunk 已提交
106 107
/* Forward declaration: poll_initwait() installs this as the queueing callback. */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);
L
Linus Torvalds 已提交
108 109 110 111

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
T
Tejun Heo 已提交
112
	pwq->polling_task = current;
L
Linus Torvalds 已提交
113 114
	pwq->error = 0;
	pwq->table = NULL;
115
	pwq->inline_index = 0;
L
Linus Torvalds 已提交
116 117 118
}
EXPORT_SYMBOL(poll_initwait);

119 120
static void free_poll_entry(struct poll_table_entry *entry)
{
W
WANG Cong 已提交
121
	remove_wait_queue(entry->wait_address, &entry->wait);
122 123 124
	fput(entry->filp);
}

L
Linus Torvalds 已提交
125 126 127
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
128 129 130
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
L
Linus Torvalds 已提交
131 132 133 134 135 136 137
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
138
			free_poll_entry(entry);
L
Linus Torvalds 已提交
139 140 141 142 143 144 145 146
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

T
Tejun Heo 已提交
147
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
L
Linus Torvalds 已提交
148 149 150
{
	struct poll_table_page *table = p->table;

151 152 153
	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

L
Linus Torvalds 已提交
154 155 156 157 158 159
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
160
			return NULL;
L
Linus Torvalds 已提交
161 162 163 164 165 166 167
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

168 169 170
	return table->entry++;
}

T
Tejun Heo 已提交
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
/*
 * Wake-up callback installed on every wait queue entry by __pollwait().
 *
 * wait->private holds the owning poll_wqueues (set in __pollwait()). The
 * callback marks the poll as triggered and then wakes pwq->polling_task
 * through default_wake_function(), returning that function's result.
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

197 198 199 200
/*
 * Add a new entry: queueing callback installed by poll_initwait().
 *
 * Takes a reference on @filp, records it together with @wait_address in
 * a fresh poll table entry and hooks that entry (with pollwake() as its
 * wake function) onto @wait_address. Does nothing if no entry could be
 * obtained; poll_get_entry() has recorded the error in that case.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *slot = poll_get_entry(queues);

	if (slot) {
		get_file(filp);
		slot->filp = filp;
		slot->wait_address = wait_address;
		init_waitqueue_func_entry(&slot->wait, pollwake);
		slot->wait.private = queues;
		/* Publish the entry only once it is fully set up. */
		add_wait_queue(wait_address, &slot->wait);
	}
}

T
Tejun Heo 已提交
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
/*
 * Sleep in task state @state until woken or until @expires (absolute,
 * with @slack passed through to schedule_hrtimeout_range()).
 *
 * If pollwake() already set pwq->triggered, no sleep happens and -EINTR
 * is returned; otherwise the result of schedule_hrtimeout_range() is
 * returned. The callers in this file treat a return value of 0 as
 * "timed out". pwq->triggered is always cleared before returning.
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	set_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312
/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts(to);
		*to = timespec_add_safe(*to, ts);
	}
	return 0;
}

/*
 * Write the time remaining until @end_time back to the user buffer @p.
 *
 * @end_time: absolute expiry computed by poll_select_set_timeout()
 * @p:        user timeout buffer, or NULL when the caller passed none
 * @timeval:  non-zero if @p is a struct timeval, zero for struct timespec
 * @ret:      result of the select/poll operation so far
 *
 * Returns @ret, except that -ERESTARTNOHAND becomes -EINTR whenever the
 * remaining time was not written back (STICKY_TIMEOUTS personality or a
 * failed copy), because the call cannot be restarted with a stale timeout.
 */
static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	ktime_get_ts(&rts);
	rts = timespec_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;

	if (timeval) {
		/*
		 * If struct timeval contains padding, clear it so that no
		 * uninitialized kernel stack bytes are copied to user space.
		 * The test is a compile-time constant.
		 */
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts.tv_sec;
		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!copy_to_user(p, &rts, sizeof(rts)))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

L
Linus Torvalds 已提交
313 314 315 316 317 318 319 320 321 322 323
/*
 * Address of long-word @n of the in/out/ex request bitmap of @fds.
 * Arguments are fully parenthesized so any expression may be passed.
 */
#define FDS_IN(fds, n)		((fds)->in + (n))
#define FDS_OUT(fds, n)		((fds)->out + (n))
#define FDS_EX(fds, n)		((fds)->ex + (n))

/* Union of the three request bitmaps for long-word @n of @fds. */
#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Validate the requested descriptors and find the highest one.
 *
 * @n:   number of descriptors covered by the bitmaps
 * @fds: request bitmaps (in/out/ex)
 *
 * Returns -EBADF if any requested bit is not set in the current task's
 * open_fds bitmap. Otherwise returns the highest requested descriptor
 * plus one, or 0 when no bit is requested at all. do_select() calls this
 * under rcu_read_lock().
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
324
	struct fdtable *fdt;
L
Linus Torvalds 已提交
325 326 327 328

	/* handle last in-complete long-word first */
	set = ~(~0UL << (n & (__NFDBITS-1)));
	n /= __NFDBITS;
329 330
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds->fds_bits+n;
L
Linus Torvalds 已提交
331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	/* Walk the complete long-words from the highest one down to word 0. */
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		/* max is already known; keep looping only to validate lower words. */
		if (max)
			continue;
get_max:
		/* Count the bit positions up to and including the highest set bit. */
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * __NFDBITS;
	}

	return max;
}

/*
 * Poll event masks that do_select() maps onto the three select() sets:
 * any event in POLLIN_SET marks a descriptor in the "in" result set,
 * POLLOUT_SET in the "out" result set and POLLEX_SET in the "ex" one.
 */
#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

365
/*
 * Core of select(): poll every requested descriptor until at least one
 * is ready, the timeout expires, a signal is pending or an error occurs.
 *
 * @n:        number of descriptors covered by the bitmaps in @fds
 * @fds:      request bitmaps (in/out/ex) and result bitmaps (res_*)
 * @end_time: absolute expiry time, NULL to wait without a timeout;
 *            a value of 0/0 means "poll once, do not sleep"
 *
 * Returns the number of result bits set, 0 if nothing became ready, or a
 * negative error (from max_select_fd() or recorded in the poll table).
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
L
Linus Torvalds 已提交
366
{
367
	ktime_t expire, *to = NULL;
L
Linus Torvalds 已提交
368 369
	struct poll_wqueues table;
	poll_table *wait;
370
	int retval, i, timed_out = 0;
371
	unsigned long slack = 0;
L
Linus Torvalds 已提交
372

373
	rcu_read_lock();
L
Linus Torvalds 已提交
374
	retval = max_select_fd(n, fds);
375
	rcu_read_unlock();
L
Linus Torvalds 已提交
376 377 378 379 380 381 382

	if (retval < 0)
		return retval;
	/* Only scan up to the highest descriptor actually requested. */
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
383
	/* Zero timeout: scan once without registering on any wait queue. */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
L
Linus Torvalds 已提交
384
		wait = NULL;
385 386 387
		timed_out = 1;
	}

388
	if (end_time && !timed_out)
389 390
		slack = estimate_accuracy(end_time);

L
Linus Torvalds 已提交
391 392 393 394 395 396 397 398 399 400
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per long-word of the bitmaps; i counts descriptors. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
401
			const struct file_operations *f_op = NULL;
L
Linus Torvalds 已提交
402 403 404 405 406 407 408 409 410 411
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			/* Nothing requested in this word: skip all its descriptors. */
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
412
				int fput_needed;
L
Linus Torvalds 已提交
413 414 415 416
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
417
				file = fget_light(i, &fput_needed);
L
Linus Torvalds 已提交
418 419 420 421 422
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * Once something is ready (retval != 0) no
					 * further waiters are registered.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
423
					fput_light(file, fput_needed);
L
Linus Torvalds 已提交
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
444
			cond_resched();
L
Linus Torvalds 已提交
445 446
		}
		/* Later passes must not pass a poll_table to ->poll() again. */
		wait = NULL;
447
		if (retval || timed_out || signal_pending(current))
L
Linus Torvalds 已提交
448
			break;
P
Pavel Machek 已提交
449
		if (table.error) {
L
Linus Torvalds 已提交
450 451 452
			retval = table.error;
			break;
		}
453

454 455 456 457 458 459 460 461
		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
462
		}
463

T
Tejun Heo 已提交
464 465
		/* A return value of 0 is taken to mean the timeout expired. */
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
466
			timed_out = 1;
L
Linus Torvalds 已提交
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * (MAX_SCHEDULE_TIMEOUT / HZ) converted to unsigned long, minus one.
 * NOTE(review): not referenced by any code visible in this file section;
 * presumably a leftover upper bound for jiffies-based timeouts - confirm.
 */
#define MAX_SELECT_SECONDS \
	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)

485
/*
 * Common body of the select-style system calls.
 *
 * @n:             number of descriptors covered by the sets
 * @inp/@outp/@exp: user-space fd sets, copied in via get_fd_set() and
 *                 overwritten with the results via set_fd_set()
 * @end_time:      absolute expiry time handed to do_select(), or NULL
 *
 * Copies the three user fd sets into kernel bitmaps (on the stack when
 * they fit, otherwise kmalloc'ed), runs do_select() and copies the result
 * bitmaps back.
 *
 * Returns the do_select() result; -EINVAL for negative @n; -ENOMEM if the
 * bitmaps cannot be allocated; the get_fd_set() error if copying in
 * fails; -EFAULT if copying the results out fails; -ERESTARTNOHAND if
 * nothing was ready and a signal is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
486
			   fd_set __user *exp, struct timespec *end_time)
L
Linus Torvalds 已提交
487 488
{
	fd_set_bits fds;
A
Andrew Morton 已提交
489
	void *bits;
490
	int ret, max_fds;
491
	unsigned int size;
492
	struct fdtable *fdt;
493
	/* Allocate small arguments on the stack to save memory and be faster */
494
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
L
Linus Torvalds 已提交
495 496 497 498 499

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

500
	/* max_fds can increase, so grab it once to avoid race */
501
	rcu_read_lock();
502
	fdt = files_fdtable(current->files);
503
	max_fds = fdt->max_fds;
504
	rcu_read_unlock();
505 506
	/* Silently clamp n to the size of the fd table. */
	if (n > max_fds)
		n = max_fds;
L
Linus Torvalds 已提交
507 508 509 510 511 512 513

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
514 515 516 517
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
518
		bits = kmalloc(6 * size, GFP_KERNEL);
519 520 521
		if (!bits)
			goto out_nofds;
	}
A
Andrew Morton 已提交
522 523 524 525 526 527
	/* Carve the buffer into six consecutive bitmaps of 'size' bytes each. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;
L
Linus Torvalds 已提交
528 529 530 531 532 533 534 535 536

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

537
	ret = do_select(n, &fds, end_time);
L
Linus Torvalds 已提交
538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553

	if (ret < 0)
		goto out;
	/* Nothing ready: ask for a restart if we were woken by a signal. */
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
554 555
	/* Only free the buffer when it did not come from the stack. */
	if (bits != stack_fds)
		kfree(bits);
L
Linus Torvalds 已提交
556 557 558 559
out_nofds:
	return ret;
}

560 561 562
/*
 * select() system call entry point.
 *
 * A non-NULL @tvp is copied in, converted to an absolute timespec expiry
 * and, after the select has run, updated with the time remaining via
 * poll_select_copy_remaining(). Returns -EFAULT if @tvp cannot be read
 * and -EINVAL if the timeout is rejected by poll_select_set_timeout().
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		long sec, nsec;

		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		/* Fold whole seconds contained in tv_usec into the seconds part. */
		sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
		nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

		to = &end_time;
		if (poll_select_set_timeout(to, sec, nsec))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);

	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

584
#ifdef HAVE_SET_RESTORE_SIGMASK
585 586 587 588 589
/*
 * pselect() implementation taking the signal mask and its size as
 * separate arguments.
 *
 * @tsp:        optional relative timeout (struct timespec), may be NULL
 * @sigmask:    optional signal mask to install for the duration of the call
 * @sigsetsize: must equal sizeof(sigset_t) when @sigmask is given
 *
 * Returns the core_sys_select() result after the remaining time has been
 * written back, -EFAULT if @tsp or @sigmask cannot be read, and -EINVAL
 * for a bad timeout or a wrong @sigsetsize.
 */
asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
		fd_set __user *exp, struct timespec __user *tsp,
		const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP can never be blocked. */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	/*
	 * Pass 'to', not &end_time: when no timeout was supplied, end_time
	 * is uninitialized and the select must wait without a timeout.
	 */
	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
/*
 * Six-argument pselect(): @sig, when non-NULL, points to a user-space
 * pair { sigset_t pointer, size_t size } which is unpacked and handed to
 * sys_pselect7(). Returns -EFAULT if the pair cannot be read.
 */
asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
	fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
{
	sigset_t __user *up = NULL;
	size_t sigsetsize = 0;

	if (sig) {
		/* Validate the whole pair once, then fetch both members. */
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)))
			return -EFAULT;
		if (__get_user(up, (sigset_t __user * __user *)sig))
			return -EFAULT;
		if (__get_user(sigsetsize,
				(size_t __user *)(sig+sizeof(void *))))
			return -EFAULT;
	}

	return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
}
655
#endif /* HAVE_SET_RESTORE_SIGMASK */
656

L
Linus Torvalds 已提交
657 658 659 660 661 662 663 664
/*
 * One chunk of the kernel copy of the user's pollfd array.
 *
 * @next:    following chunk, NULL for the last one
 * @len:     number of valid elements in @entries
 * @entries: the pollfd records themselves (C99 flexible array member;
 *           contributes nothing to sizeof)
 */
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[];
};

/* Number of struct pollfd that fit in one page after the poll_list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

665 666 667 668 669 670 671 672
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if non-NULL.
 */
/*
 * Poll a single pollfd record.
 *
 * A negative fd yields an empty mask; an fd with no file behind it yields
 * POLLNVAL. Otherwise the file's ->poll() result (DEFAULT_POLLMASK when
 * the file has no such method) is reduced to the requested events plus
 * POLLERR and POLLHUP. The mask is stored in pollfd->revents and returned.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int mask = 0;
	int fd = pollfd->fd;

	if (fd >= 0) {
		int fput_needed;
		struct file *file = fget_light(fd, &fput_needed);

		if (file == NULL) {
			mask = POLLNVAL;
		} else {
			if (file->f_op && file->f_op->poll)
				mask = file->f_op->poll(file, pwait);
			else
				mask = DEFAULT_POLLMASK;
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fput_light(file, fput_needed);
		}
	}

	pollfd->revents = mask;
	return mask;
}

/*
 * Core of poll(): scan every pollfd in @list until at least one reports
 * an event, the timeout expires, a signal is pending or an error occurs.
 *
 * @nfds:     number of descriptors (not used by the body shown here)
 * @list:     chained chunks holding the kernel copy of the pollfd array
 * @wait:     poll wait queues already set up with poll_initwait()
 * @end_time: absolute expiry time, NULL to wait without a timeout;
 *            a value of 0/0 means "poll once, do not sleep"
 *
 * Returns the number of pollfds with a non-zero revents, 0 on timeout,
 * wait->error if registering a waiter failed, or -EINTR when a signal is
 * pending and nothing was ready.
 */
static int do_poll(unsigned int nfds,  struct poll_list *list,
700
		   struct poll_wqueues *wait, struct timespec *end_time)
L
Linus Torvalds 已提交
701 702
{
	poll_table* pt = &wait->pt;
703 704
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
705
	unsigned long slack = 0;
L
Linus Torvalds 已提交
706

707
	/* Optimise the no-wait case */
708
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
L
Linus Torvalds 已提交
709
		pt = NULL;
710 711
		timed_out = 1;
	}
712

713
	if (end_time && !timed_out)
714 715
		slack = estimate_accuracy(end_time);

L
Linus Torvalds 已提交
716 717
	for (;;) {
		struct poll_list *walk;
718

719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill the poll_table, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt)) {
					count++;
					pt = NULL;
				}
			}
L
Linus Torvalds 已提交
737
		}
738 739 740 741
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table to them on the next loop iteration.
		 */
L
Linus Torvalds 已提交
742
		pt = NULL;
743 744 745 746 747
		/* Nothing ready: report a registration error or a pending signal. */
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
748
		if (count || timed_out)
L
Linus Torvalds 已提交
749
			break;
750

751 752 753 754 755 756 757 758
		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
759 760
		}

T
Tejun Heo 已提交
761
		/* A return value of 0 is taken to mean the timeout expired. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
762
			timed_out = 1;
L
Linus Torvalds 已提交
763 764 765 766
	}
	return count;
}

767 768 769
/*
 * Number of struct pollfd that fit in the on-stack buffer after the
 * poll_list header. Expands to sizeof(stack_pps), so it is only usable
 * where a local array named stack_pps is in scope (see do_sys_poll()).
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

770 771
/*
 * Common body of the poll-style system calls.
 *
 * @ufds:     user-space array of @nfds pollfd records
 * @nfds:     number of records in @ufds
 * @end_time: absolute expiry time handed to do_poll(), or NULL
 *
 * Copies the user array into a chain of poll_list chunks (the first one
 * on the stack, further ones kmalloc'ed), runs do_poll() and writes each
 * revents field back to user space.
 *
 * Returns the do_poll() result; -EINVAL if @nfds exceeds the
 * RLIMIT_NOFILE soft limit; -EFAULT if the user array cannot be read or
 * written; -ENOMEM if a chunk cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
L
Linus Torvalds 已提交
772 773
{
	struct poll_wqueues table;
774
 	int err = -EFAULT, fdcount, len, size;
775 776 777 778
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
779 780 781
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
L
Linus Torvalds 已提交
782

783
	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
L
Linus Torvalds 已提交
784 785
		return -EINVAL;

786 787 788 789 790 791
	/* The first chunk lives in the on-stack buffer. */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
L
Linus Torvalds 已提交
792

793 794 795 796 797 798 799
		/* nfds - todo is the number of records copied so far. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;
L
Linus Torvalds 已提交
800

801 802 803 804 805
		/* Further chunks are allocated, at most one page's worth each. */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
L
Linus Torvalds 已提交
806 807 808
			goto out_fds;
		}
	}
809

810
	poll_initwait(&table);
811
	fdcount = do_poll(nfds, head, &table, end_time);
812
	poll_freewait(&table);
L
Linus Torvalds 已提交
813

814
	/* Write every revents back; err is still -EFAULT if that fails. */
	for (walk = head; walk; walk = walk->next) {
L
Linus Torvalds 已提交
815 816 817
		struct pollfd *fds = walk->entries;
		int j;

818 819
		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
L
Linus Torvalds 已提交
820 821
				goto out_fds;
  	}
822

L
Linus Torvalds 已提交
823 824
	err = fdcount;
out_fds:
825 826 827 828 829
	/* head itself is on the stack; only the chained chunks are freed. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
L
Linus Torvalds 已提交
830
	}
831

L
Linus Torvalds 已提交
832 833
	return err;
}
834

835 836
static long do_restart_poll(struct restart_block *restart_block)
{
837 838 839
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec *to = NULL, end_time;
840 841
	int ret;

842 843 844 845 846 847 848 849
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

850 851 852 853 854 855 856
	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

857 858 859
/*
 * poll() system call entry point.
 *
 * A negative @timeout_msecs means "no timeout"; otherwise it is turned
 * into an absolute expiry time. When the poll is interrupted, the
 * arguments and that expiry time are stashed in the thread's
 * restart_block so do_restart_poll() can resume with the original
 * deadline.
 */
asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
			long timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	struct restart_block *restart_block;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);
	if (ret != -EINTR)
		return ret;

	restart_block = &current_thread_info()->restart_block;
	restart_block->fn = do_restart_poll;
	restart_block->poll.ufds = ufds;
	restart_block->poll.nfds = nfds;

	if (timeout_msecs < 0) {
		restart_block->poll.has_timeout = 0;
	} else {
		restart_block->poll.tv_sec = end_time.tv_sec;
		restart_block->poll.tv_nsec = end_time.tv_nsec;
		restart_block->poll.has_timeout = 1;
	}

	return -ERESTART_RESTARTBLOCK;
}

891
#ifdef HAVE_SET_RESTORE_SIGMASK
892 893 894 895 896
/*
 * ppoll() system call entry point.
 *
 * @tsp:        optional relative timeout (struct timespec), may be NULL
 * @sigmask:    optional signal mask to install for the duration of the call
 * @sigsetsize: must equal sizeof(sigset_t) when @sigmask is given
 *
 * Returns the do_sys_poll() result after the remaining time has been
 * written back, -EFAULT if @tsp or @sigmask cannot be read, and -EINVAL
 * for a bad timeout or a wrong @sigsetsize.
 */
asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
	struct timespec __user *tsp, const sigset_t __user *sigmask,
	size_t sigsetsize)
{
	struct timespec ts, end_time, *to = NULL;
	sigset_t ksigmask, sigsaved;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret != -EINTR) {
		if (sigmask)
			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	} else {
		/* We can restart this syscall, usually */
		if (sigmask) {
			/*
			 * Don't restore the signal mask yet. Let do_signal()
			 * deliver the signal on the way back to userspace,
			 * before the signal mask is restored.
			 */
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	}

	return poll_select_copy_remaining(&end_time, tsp, 0, ret);
}
942
#endif /* HAVE_SET_RESTORE_SIGMASK */