/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
#include <linux/vmalloc.h>

#include <linux/uaccess.h>


/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */
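
/*
 * Worked example, straight from the constants below: with the default
 * divfactor of 1000 a 2 s timeout yields 2 ms of slack (0.1%), a niced
 * task (divfactor 200) gets 10 ms (0.5%), and the result is clamped to
 * MAX_SLACK (100 ms) for very long timeouts.
 */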

#define MAX_SLACK	(100 * NSEC_PER_MSEC)

static long __estimate_accuracy(struct timespec64 *tv)
{
	long slack;
	int divfactor = 1000;

	if (tv->tv_sec < 0)
		return 0;

	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}

u64 select_estimate_accuracy(struct timespec64 *tv)
{
	u64 ret;
	struct timespec64 now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

	if (rt_task(current))
		return 0;

	ktime_get_ts64(&now);
	now = timespec64_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[0];
};

#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
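
/*
 * Entries are carved out of whole pages: the zero-length entries[]
 * array runs to the end of the page holding the poll_table_page
 * header, and POLL_TABLE_FULL() fires when the next carve-out would
 * cross that page boundary.
 */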

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() do all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);
}

void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following smp_store_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec64 variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
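/*
 * For example, sys_poll() below feeds a 1500 ms timeout in as
 * sec == 1, nsec == 500000000; a 0/0 pair makes the caller poll
 * once and return immediately.
 */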
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec64_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
	}
	return 0;
}

static int poll_select_copy_remaining(struct timespec64 *end_time,
				      void __user *p,
				      int timeval, int ret)
{
	struct timespec64 rts64;
	struct timespec rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	ktime_get_ts64(&rts64);
	rts64 = timespec64_sub(*end_time, rts64);
	if (rts64.tv_sec < 0)
		rts64.tv_sec = rts64.tv_nsec = 0;

	rts = timespec64_to_timespec(rts64);

	if (timeval) {
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts64.tv_sec;
		rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!copy_to_user(p, &rts, sizeof(rts)))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Scalable version of the fd_set.
 */

typedef struct {
	unsigned long *in, *out, *ex;
	unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
#define FDS_BITPERLONG	(8*sizeof(long))
#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
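
/* e.g. on a 64-bit kernel, FDS_LONGS(1024) == 16 and FDS_BYTES(1024) == 128 */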

/*
 * We do a VERIFY_WRITE here even though we are only reading this time:
 * we'll write to it eventually..
 *
 * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	nr = FDS_BYTES(nr);
	if (ufdset)
		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;

	memset(fdset, 0, nr);
	return 0;
}

static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (ufdset)
		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
	return 0;
}

static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	memset(fdset, 0, FDS_BYTES(nr));
}

#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
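
/*
 * Walk the three input sets, verify that every fd requested is
 * actually open, and return one past the highest fd found (i.e. the
 * effective value of n), or -EBADF.
 */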

static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last incomplete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		if (max)
			continue;
get_max:
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)
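
/*
 * These sets translate select()'s read/write/exception semantics into
 * poll events; POLLHUP and POLLERR count as both readable and
 * writable so that select() also wakes up on hangup and error.
 */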

static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				unsigned int ll_flag)
{
	wait->_key = POLLEX_SET | ll_flag;
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
}

static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
504 505 506 507
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op->poll) {
						wait_key_set(wait, in, out,
							     bit, busy_flag);
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
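		/*
		 * The first pass merely records when spinning started;
		 * later passes keep polling until busy_loop_timeout()
		 * says the busy-poll budget is spent, then we fall
		 * through and sleep.
		 */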
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restarts only when you want to.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	size_t size, alloc_size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words. 
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		alloc_size = 6 * size;
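		/*
		 * kvmalloc() falls back to vmalloc() for the largest
		 * bitmaps, hence the matching kvfree() on the way out.
		 */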
		bits = kvmalloc(alloc_size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kvfree(bits);
out_nofds:
	return ret;
}

SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec64 end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
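		/*
		 * The split below tolerates a denormalized timeval:
		 * tv_usec == 1500000, say, contributes one extra second
		 * plus 500000 us.
		 */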
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts;
	struct timespec64 ts64, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;
		ts64 = timespec_to_timespec64(ts);

		to = &end_time;
		if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
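/* i.e. 'sig' points at { sigset_t __user *p; size_t size; } */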
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
		    || __get_user(up, (sigset_t __user * __user *)sig)
		    || __get_user(sigsetsize,
				(size_t __user *)(sig+sizeof(void *))))
			return -EFAULT;
	}

	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
}

#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[0];
};

#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
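/* e.g. (4096 - 16) / 8 == 510 entries per chunk on a typical 64-bit build */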

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
806
 * if pwait->_qproc is non-NULL.
807
 */
808
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
809 810
				     bool *can_busy_poll,
				     unsigned int busy_flag)
L
812 813 814 815 816 817
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
		struct fd f = fdget(fd);
		mask = POLLNVAL;
		if (f.file) {
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op->poll) {
				pwait->_key = pollfd->events|POLLERR|POLLHUP;
				pwait->_key |= busy_flag;
				mask = f.file->f_op->poll(f.file, pwait);
				if (mask & busy_flag)
					*can_busy_poll = true;
			}
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fdput(f);
		}
	}
	pollfd->revents = mask;

	return mask;
}

static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))
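
/*
 * The user's pollfd array is copied in chunks: the first N_STACK_PPS
 * entries land in an on-stack buffer and the remainder is chained
 * through kmalloc'd poll_list blocks of at most POLLFD_PER_PAGE
 * entries each, so nfds is not capped by any single allocation.
 */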

static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
 	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);

	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
  	}

	err = fdcount;
out_fds:
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}
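
/*
 * If poll() is interrupted by a signal that has no user handler, the
 * syscall is transparently re-issued through this helper with the
 * original absolute end time stashed in current->restart_block, so a
 * restart does not extend the caller's timeout.
 */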

static long do_restart_poll(struct restart_block *restart_block)
{
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec64 *to = NULL, end_time;
	int ret;

	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		restart_block = &current->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts;
	struct timespec64 end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}

#ifdef CONFIG_COMPAT
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

static
int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec ts;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	ktime_get_ts(&ts);
	ts = timespec_sub(*end_time, ts);
	if (ts.tv_sec < 0)
		ts.tv_sec = ts.tv_nsec = 0;

	if (timeval) {
		struct compat_timeval rtv;

		rtv.tv_sec = ts.tv_sec;
		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;
	} else {
		struct compat_timespec rts;

		rts.tv_sec = ts.tv_sec;
		rts.tv_nsec = ts.tv_nsec;

		if (!copy_to_user(p, &rts, sizeof(rts)))
			return ret;
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
 * 64-bit unsigned longs.
 */
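/*
 * compat_get_bitmap()/compat_put_bitmap() splice pairs of 32-bit words
 * into native longs and back, so the native do_select() bit walk works
 * on compat fd_sets unchanged.
 */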
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
			unsigned long *fdset)
{
	if (ufdset) {
		return compat_get_bitmap(fdset, ufdset, nr);
	} else {
		zero_fd_set(nr, fdset);
		return 0;
	}
}

static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
		      unsigned long *fdset)
{
	if (!ufdset)
		return 0;
	return compat_put_bitmap(ufdset, fdset, nr);
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restarts only when you want to.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct timespec *end_time)
{
	fd_set_bits fds;
	void *bits;
	int size, max_fds, ret = -EINVAL;
	struct fdtable *fdt;
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		bits = kmalloc(6 * size, GFP_KERNEL);
		ret = -ENOMEM;
		if (!bits)
			goto out_nofds;
	}
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
	    (ret = compat_get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (compat_set_fd_set(n, inp, fds.res_in) ||
	    compat_set_fd_set(n, outp, fds.res_out) ||
	    compat_set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct compat_timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);
	ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

struct compat_sel_arg_struct {
	compat_ulong_t n;
	compat_uptr_t inp;
	compat_uptr_t outp;
	compat_uptr_t exp;
	compat_uptr_t tvp;
};

COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
	struct compat_sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
				 compat_ptr(a.exp), compat_ptr(a.tvp));
}

static long do_compat_pselect(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
	compat_size_t sigsetsize)
{
	compat_sigset_t ss32;
	sigset_t ksigmask, sigsaved;
	struct compat_timespec ts;
	struct timespec end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
			return -EFAULT;
		sigset_from_compat(&ksigmask, &ss32);

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);
	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timespec __user *, tsp, void __user *, sig)
{
	compat_size_t sigsetsize = 0;
	compat_uptr_t up = 0;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig,
				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
		    	__get_user(up, (compat_uptr_t __user *)sig) ||
		    	__get_user(sigsetsize,
				(compat_size_t __user *)(sig+sizeof(up))))
			return -EFAULT;
	}
	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
				 sigsetsize);
}

COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
	unsigned int,  nfds, struct compat_timespec __user *, tsp,
	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
	compat_sigset_t ss32;
	sigset_t ksigmask, sigsaved;
	struct compat_timespec ts;
	struct timespec end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
			return -EFAULT;
		sigset_from_compat(&ksigmask, &ss32);

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
				sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}
#endif