select.c 34.5 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

18
#include <linux/kernel.h>
19 20
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
L
Linus Torvalds 已提交
21
#include <linux/syscalls.h>
22
#include <linux/export.h>
L
Linus Torvalds 已提交
23 24 25 26
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
A
Al Viro 已提交
27
#include <linux/fdtable.h>
L
Linus Torvalds 已提交
28
#include <linux/fs.h>
29
#include <linux/rcupdate.h>
30
#include <linux/hrtimer.h>
31
#include <linux/freezer.h>
32
#include <net/busy_poll.h>
33
#include <linux/vmalloc.h>
L
Linus Torvalds 已提交
34

35
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
36

37 38 39 40 41 42 43 44 45 46 47 48 49

/*
 * Estimate expected accuracy in ns from a timespec64.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

50 51
#define MAX_SLACK	(100 * NSEC_PER_MSEC)

52
static long __estimate_accuracy(struct timespec64 *tv)
53
{
54
	long slack;
55 56
	int divfactor = 1000;

57 58 59
	if (tv->tv_sec < 0)
		return 0;

60
	if (task_nice(current) > 0)
61 62
		divfactor = divfactor / 5;

63 64 65
	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

66 67 68
	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

69 70
	if (slack > MAX_SLACK)
		return MAX_SLACK;
71

72 73 74
	return slack;
}

75
u64 select_estimate_accuracy(struct timespec64 *tv)
76
{
77
	u64 ret;
78
	struct timespec64 now;
79 80 81 82 83

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

84
	if (rt_task(current))
85 86
		return 0;

87 88
	ktime_get_ts64(&now);
	now = timespec64_sub(*tv, now);
89 90 91 92 93 94 95 96
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



L
Linus Torvalds 已提交
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
/*
 * One overflow page of poll table entries.
 *
 * @next:    next page in the singly linked list of pages
 * @entry:   first unused slot in @entries
 * @entries: the slots themselves; declared as a C99 flexible array member
 *           (rather than a zero-length array) so the compiler knows this
 *           is trailing variable-length storage
 */
struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[];
};

/* True when one more entry would extend past PAGE_SIZE bytes from @table. */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() do all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
A
Adrian Bunk 已提交
118 119
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);
L
Linus Torvalds 已提交
120 121 122 123

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
T
Tejun Heo 已提交
124
	pwq->polling_task = current;
125
	pwq->triggered = 0;
L
Linus Torvalds 已提交
126 127
	pwq->error = 0;
	pwq->table = NULL;
128
	pwq->inline_index = 0;
L
Linus Torvalds 已提交
129 130 131
}
EXPORT_SYMBOL(poll_initwait);

132 133
static void free_poll_entry(struct poll_table_entry *entry)
{
W
WANG Cong 已提交
134
	remove_wait_queue(entry->wait_address, &entry->wait);
135 136 137
	fput(entry->filp);
}

L
Linus Torvalds 已提交
138 139 140
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
141 142 143
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
L
Linus Torvalds 已提交
144 145 146 147 148 149 150
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
151
			free_poll_entry(entry);
L
Linus Torvalds 已提交
152 153 154 155 156 157 158 159
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

T
Tejun Heo 已提交
160
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
L
Linus Torvalds 已提交
161 162 163
{
	struct poll_table_page *table = p->table;

164 165 166
	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

L
Linus Torvalds 已提交
167 168 169 170 171 172
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
173
			return NULL;
L
Linus Torvalds 已提交
174 175 176 177 178 179 180
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

181 182 183
	return table->entry++;
}

184
/*
 * Mark the poll_wqueues behind @wait as triggered and wake its polling
 * task.  Returns whatever default_wake_function() returns.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * We run under the waitqueue lock, but LOCK does not imply a write
	 * barrier, and users of wakeup functions expect write barrier
	 * semantics.  This smp_wmb() is equivalent to the one in
	 * try_to_wake_up() and pairs with the smp_store_mb() in
	 * poll_schedule_timeout().
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Do the default wakeup through a dummy waitqueue entry.
	 *
	 * TODO: This is hacky, but there is currently no interface to
	 * pass in @sync.  @sync is scheduled to be removed; once that
	 * happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

210
/*
 * Wait queue callback installed by __pollwait().
 *
 * A wakeup that carries a @key is ignored (returns 0) unless the key has
 * at least one event in common with the events this entry registered for;
 * every other wakeup is forwarded to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);

	if (!key || (key_to_poll(key) & entry->key))
		return __pollwake(wait, mode, sync, key);

	return 0;
}

220 221 222 223
/*
 * Queueing callback: register the poller behind @p on @wait_address.
 *
 * Takes a reference on @filp, remembers the requested event key, and adds
 * a wait queue entry whose wake function is pollwake().  Does nothing if
 * no poll table entry can be obtained.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry;

	entry = poll_get_entry(pwq);
	if (!entry)
		return;

	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;

	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

T
Tejun Heo 已提交
236 237 238 239 240 241 242
/*
 * Sleep in task state @state until @expires (absolute, with @slack), unless
 * a wakeup has already been recorded in pwq->triggered.
 *
 * Returns -EINTR without sleeping if a wakeup was already recorded,
 * otherwise the return value of schedule_hrtimeout_range().
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc;

	set_current_state(state);
	if (pwq->triggered)
		rc = -EINTR;
	else
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Get ready for the next iteration.
	 *
	 * The smp_store_mb() below does two jobs.  It is the rmb
	 * counterpart of the wmb in pollwake(), so that data written
	 * before the wakeup is visible after it.  And, being a full
	 * barrier, it keeps the clearing of ->triggered from passing the
	 * event check of the next iteration.  The first iteration does not
	 * have this problem because add_wait_queue() has full barrier
	 * semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

263 264
/**
 * poll_select_set_timeout - helper function to setup the timeout value
265
 * @to:		pointer to timespec64 variable for the final timeout
266 267 268 269 270 271 272 273
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
274
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
275
{
276
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
277

278
	if (!timespec64_valid(&ts))
279 280 281 282 283 284
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
285 286
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
287 288 289 290
	}
	return 0;
}

291 292
/*
 * Write the time remaining until @end_time back to the user buffer @p, as
 * a struct timeval when @timeval is non-zero and as a timespec otherwise.
 *
 * Nothing is written when @p is NULL, when the task has STICKY_TIMEOUTS
 * set in its personality, or when @end_time is the zero timeout.
 *
 * Returns @ret, except that -ERESTARTNOHAND is turned into -EINTR when the
 * remaining time was not written back (sticky timeouts or a failed copy).
 */
static int poll_select_copy_remaining(struct timespec64 *end_time,
				      void __user *p,
				      int timeval, int ret)
{
	struct timespec64 rts;
	struct timeval rtv;
	int failed;

	if (!p)
		return ret;

	if (!(current->personality & STICKY_TIMEOUTS)) {
		/* No update for zero timeout */
		if (!end_time->tv_sec && !end_time->tv_nsec)
			return ret;

		ktime_get_ts64(&rts);
		rts = timespec64_sub(*end_time, rts);
		if (rts.tv_sec < 0) {
			rts.tv_nsec = 0;
			rts.tv_sec = 0;
		}

		if (timeval) {
			/* Clear any padding before it is copied to user space. */
			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
				memset(&rtv, 0, sizeof(rtv));
			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

			failed = copy_to_user(p, &rtv, sizeof(rtv)) != 0;
		} else {
			failed = put_timespec64(&rts, p) != 0;
		}

		if (!failed)
			return ret;
	}

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
/*
 * Scalable version of the fd_set: three input bitmaps and the three
 * matching result bitmaps, each an array of unsigned long.
 */
typedef struct {
	unsigned long *in;
	unsigned long *out;
	unsigned long *ex;
	unsigned long *res_in;
	unsigned long *res_out;
	unsigned long *res_ex;
} fd_set_bits;

/*
 * Number of longwords (and bytes) needed to hold "nr" bits, rounded up to
 * a whole number of longs.
 */
#define FDS_BITPERLONG	(sizeof(long) * 8)
#define FDS_LONGS(nr)	(((nr) + FDS_BITPERLONG - 1) / FDS_BITPERLONG)
#define FDS_BYTES(nr)	(FDS_LONGS(nr) * sizeof(long))

/*
 * Load the bitmap for "nr" descriptors from the user pointer @ufdset into
 * @fdset.  A NULL @ufdset yields an all-zero bitmap.
 *
 * "unsigned long" accesses are used so that user-mode fd_set's may be
 * long-aligned.
 *
 * Returns 0 on success, -EFAULT if the copy from user space fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	unsigned long bytes = FDS_BYTES(nr);

	if (!ufdset) {
		memset(fdset, 0, bytes);
		return 0;
	}

	return copy_from_user(fdset, ufdset, bytes) ? -EFAULT : 0;
}

/*
 * Store the bitmap for "nr" descriptors from @fdset to the user pointer
 * @ufdset.  A NULL @ufdset is skipped.
 *
 * Returns 0 when nothing had to be written, otherwise the return value of
 * __copy_to_user().
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (!ufdset)
		return 0;

	return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

/* Clear the whole bitmap @fdset, sized for "nr" descriptors. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	unsigned long bytes = FDS_BYTES(nr);

	memset(fdset, 0, bytes);
}

L
Linus Torvalds 已提交
387 388 389 390 391 392 393 394 395 396 397
/*
 * Address of longword @n in the in/out/ex bitmap of @fds.  The arguments
 * are parenthesized so that expressions such as "&set" or "i + 1" expand
 * correctly.
 */
#define FDS_IN(fds, n)		((fds)->in + (n))
#define FDS_OUT(fds, n)		((fds)->out + (n))
#define FDS_EX(fds, n)		((fds)->ex + (n))

/* Union of the three input bitmaps of @fds at longword @n. */
#define BITS(fds, n)	(*FDS_IN(fds, n) | *FDS_OUT(fds, n) | *FDS_EX(fds, n))

/*
 * Check the first @n bits of the in/out/ex bitmaps in @fds against the
 * open-descriptor bitmap of the current task's fdtable.
 *
 * Returns -EBADF if a requested bit is not set in the open-descriptor
 * bitmap.  Otherwise returns one more than the highest requested
 * descriptor, or 0 if no bit is requested.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	struct fdtable *fdt = files_fdtable(current->files);
	unsigned long *open_word;
	unsigned long bits;
	int highest = 0;

	/* handle last in-complete long-word first */
	bits = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	open_word = fdt->open_fds + n;

	if (bits) {
		bits &= BITS(fds, n);
		if (bits) {
			if (bits & ~*open_word)
				return -EBADF;
			goto get_max;
		}
	}

	/* Walk the complete words from the highest one downwards. */
	while (n) {
		open_word--;
		n--;
		bits = BITS(fds, n);
		if (!bits)
			continue;
		if (bits & ~*open_word)
			return -EBADF;
		/* Highest descriptor already known: only validate the rest. */
		if (highest)
			continue;
get_max:
		/* Count bit positions up to the top set bit of this word. */
		do {
			highest++;
			bits >>= 1;
		} while (bits);
		highest += n * BITS_PER_LONG;
	}

	return highest;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

439
static inline void wait_key_set(poll_table *wait, unsigned long in,
440
				unsigned long out, unsigned long bit,
A
Al Viro 已提交
441
				__poll_t ll_flag)
442
{
443
	wait->_key = POLLEX_SET | ll_flag;
444 445 446 447
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
448 449
}

450
/*
 * Core of select(): poll the descriptors named in @fds until at least one
 * is ready, the timeout @end_time expires, a signal is pending or an
 * error occurs.
 *
 * @n:        number of descriptors to look at (trimmed to the highest
 *            descriptor actually requested)
 * @fds:      input bitmaps and the result bitmaps to fill in
 * @end_time: absolute end time, the zero timespec for "do not wait", or
 *            NULL for no timeout
 *
 * Returns the number of ready bits recorded in the result bitmaps, 0 if
 * nothing became ready, or a negative error (from max_select_fd() or from
 * the poll table).
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;

	if (end_time) {
		if (!end_time->tv_sec && !end_time->tv_nsec) {
			/* Zero timeout: one pass only, never queue. */
			wait->_qproc = NULL;
			timed_out = 1;
		} else {
			slack = select_estimate_accuracy(end_time);
		}
	}

	retval = 0;
	for (;;) {
		unsigned long *inp = fds->in;
		unsigned long *outp = fds->out;
		unsigned long *exp = fds->ex;
		unsigned long *rinp = fds->res_in;
		unsigned long *routp = fds->res_out;
		unsigned long *rexp = fds->res_ex;
		bool can_busy_loop = false;

		/* One outer iteration per longword of the bitmaps. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in = *inp++;
			unsigned long out = *outp++;
			unsigned long ex = *exp++;
			unsigned long all_bits = in | out | ex;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			unsigned long bit, j;

			if (!all_bits) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0, bit = 1; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				__poll_t mask;

				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;

				f = fdget(i);
				if (!f.file)
					continue;

				if (f.file->f_op->poll) {
					wait_key_set(wait, in, out,
						     bit, busy_flag);
					mask = f.file->f_op->poll(f.file, wait);
				} else {
					mask = DEFAULT_POLLMASK;
				}
				fdput(f);

				/*
				 * Once anything is ready, stop registering
				 * further waiters by clearing _qproc.
				 */
				if ((mask & POLLIN_SET) && (in & bit)) {
					res_in |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLOUT_SET) && (out & bit)) {
					res_out |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLEX_SET) && (ex & bit)) {
					res_ex |= bit;
					retval++;
					wait->_qproc = NULL;
				}

				if (retval) {
					/* got something, stop busy polling */
					can_busy_loop = false;
					busy_flag = 0;
				} else if (busy_flag & mask) {
					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					can_busy_loop = true;
				}
			}

			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}

		/* Later passes must not queue again. */
		wait->_qproc = NULL;

		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * On the first pass that gets this far with a timeout
		 * given, convert it to ktime_t and point "to" at the
		 * expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
599
/*
 * Common implementation behind the select-style system calls.
 *
 * Copies the three user fd_sets in, runs do_select() and copies the
 * result sets back.  @n is clamped to the max_fds of the current fdtable.
 *
 * Returns the do_select() result, or -EINVAL for a negative @n, -ENOMEM
 * if the bitmaps cannot be allocated, -EFAULT if a user copy fails, and
 * -ERESTARTNOHAND when nothing is ready and a signal is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
	void *bits = stack_fds;
	fd_set_bits fds;
	size_t size;
	int max_fds;
	int ret;

	if (n < 0)
		return -EINVAL;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	max_fds = files_fdtable(current->files)->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * Six bitmaps are needed (in/out/ex, incoming and outgoing), each
	 * sized in whole longwords.
	 */
	size = FDS_BYTES(n);
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		if (size > (SIZE_MAX / 6))
			return -ENOMEM;

		bits = kvmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			return -ENOMEM;
	}

	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	ret = get_fd_set(n, inp, fds.in);
	if (ret)
		goto out;
	ret = get_fd_set(n, outp, fds.out);
	if (ret)
		goto out;
	ret = get_fd_set(n, exp, fds.ex);
	if (ret)
		goto out;

	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);
	if (ret < 0)
		goto out;

	if (!ret && signal_pending(current)) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kvfree(bits);
	return ret;
}

678 679
/*
 * select() system call: the timeout is given as a user struct timeval.
 * Whole seconds contained in tv_usec are carried into the seconds part
 * before the end time is computed.
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec64 end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		ret = poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC);
		if (ret)
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);

	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

702 703 704
/*
 * Implementation of pselect: like select, but with a timespec timeout and
 * an optional signal mask that is installed for the duration of the call.
 *
 * SIGKILL and SIGSTOP are removed from the user supplied mask.  Returns
 * -EINVAL if @sigsetsize is not sizeof(sigset_t) or the timeout is
 * invalid, -EFAULT if a user copy fails, otherwise the select result.
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	struct timespec64 ts, end_time, *to = NULL;
	sigset_t ksigmask, sigsaved;
	int ret;

	if (tsp) {
		if (get_timespec64(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (!sigmask)
		return ret;

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		memcpy(&current->saved_sigmask, &sigsaved,
				sizeof(sigsaved));
		set_restore_sigmask();
	} else {
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
756 757 758
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
759 760 761 762 763 764
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
765
		    || __get_user(up, (sigset_t __user * __user *)sig)
766
		    || __get_user(sigsetsize,
767
				(size_t __user *)(sig+sizeof(void *))))
768 769 770
			return -EFAULT;
	}

771
	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
772 773
}

C
Christoph Hellwig 已提交
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Argument block of old_select(): the select() arguments in one struct. */
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

/*
 * Old-style select entry point: fetch the argument block from user space
 * and forward its members to sys_select().
 */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct args;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	return sys_select(args.n, args.inp, args.outp, args.exp, args.tvp);
}
#endif

L
Linus Torvalds 已提交
791 792 793 794 795 796 797 798
/*
 * One chunk of pollfd entries in a singly linked list.
 *
 * @next:    next chunk, or NULL
 * @len:     number of valid elements in @entries
 * @entries: the pollfd array; declared as a C99 flexible array member
 *           (rather than a zero-length array) so the compiler knows this
 *           is trailing variable-length storage
 */
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[];
};

/* Number of pollfd entries that fit in one page after the list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

799 800 801 802 803
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
804
 * if pwait->_qproc is non-NULL.
805
 */
A
Al Viro 已提交
806
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
807
				     bool *can_busy_poll,
A
Al Viro 已提交
808
				     __poll_t busy_flag)
L
Linus Torvalds 已提交
809
{
810
	__poll_t mask;
811 812 813 814 815
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
816
		struct fd f = fdget(fd);
817
		mask = POLLNVAL;
818
		if (f.file) {
A
Al Viro 已提交
819
			/* userland u16 ->events contains POLL... bitmap */
820
			__poll_t filter = demangle_poll(pollfd->events) |
A
Al Viro 已提交
821
						POLLERR | POLLHUP;
822
			mask = DEFAULT_POLLMASK;
A
Al Viro 已提交
823
			if (f.file->f_op->poll) {
A
Al Viro 已提交
824
				pwait->_key = filter;
825
				pwait->_key |= busy_flag;
826
				mask = f.file->f_op->poll(f.file, pwait);
827 828
				if (mask & busy_flag)
					*can_busy_poll = true;
829
			}
830
			/* Mask out unneeded events. */
A
Al Viro 已提交
831
			mask &= filter;
832
			fdput(f);
L
Linus Torvalds 已提交
833 834
		}
	}
A
Al Viro 已提交
835
	/* ... and so does ->revents */
836
	pollfd->revents = mangle_poll(mask);
837 838

	return mask;
L
Linus Torvalds 已提交
839 840
}

841
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
842
		   struct timespec64 *end_time)
L
Linus Torvalds 已提交
843 844
{
	poll_table* pt = &wait->pt;
845 846
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
847
	u64 slack = 0;
A
Al Viro 已提交
848
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
849
	unsigned long busy_start = 0;
L
Linus Torvalds 已提交
850

851
	/* Optimise the no-wait case */
852
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
853
		pt->_qproc = NULL;
854 855
		timed_out = 1;
	}
856

857
	if (end_time && !timed_out)
858
		slack = select_estimate_accuracy(end_time);
859

L
Linus Torvalds 已提交
860 861
	for (;;) {
		struct poll_list *walk;
862
		bool can_busy_loop = false;
863

864 865 866 867 868 869 870 871
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
872
				 * and kill poll_table->_qproc, so we don't
873 874 875 876
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
877 878
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
879
					count++;
880
					pt->_qproc = NULL;
881 882 883
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
884 885
				}
			}
L
Linus Torvalds 已提交
886
		}
887 888
		/*
		 * All waiters have already been registered, so don't provide
889
		 * a poll_table->_qproc to them on the next loop iteration.
890
		 */
891
		pt->_qproc = NULL;
892 893 894 895 896
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
897
		if (count || timed_out)
L
Linus Torvalds 已提交
898
			break;
899

900
		/* only if found POLL_BUSY_LOOP sockets && not out of time */
901
		if (can_busy_loop && !need_resched()) {
902 903
			if (!busy_start) {
				busy_start = busy_loop_current_time();
904 905
				continue;
			}
906
			if (!busy_loop_timeout(busy_start))
907 908 909
				continue;
		}
		busy_flag = 0;
910

911 912 913 914 915 916
		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
917
			expire = timespec64_to_ktime(*end_time);
918
			to = &expire;
919 920
		}

T
Tejun Heo 已提交
921
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
922
			timed_out = 1;
L
Linus Torvalds 已提交
923 924 925 926
	}
	return count;
}

927 928 929
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

930
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
931
		struct timespec64 *end_time)
L
Linus Torvalds 已提交
932 933
{
	struct poll_wqueues table;
934
 	int err = -EFAULT, fdcount, len, size;
935 936 937 938
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
939 940 941
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
L
Linus Torvalds 已提交
942

J
Jiri Slaby 已提交
943
	if (nfds > rlimit(RLIMIT_NOFILE))
L
Linus Torvalds 已提交
944 945
		return -EINVAL;

946 947 948 949 950 951
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
L
Linus Torvalds 已提交
952

953 954 955 956 957 958 959
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;
L
Linus Torvalds 已提交
960

961 962 963 964 965
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
L
Linus Torvalds 已提交
966 967 968
			goto out_fds;
		}
	}
969

970
	poll_initwait(&table);
971
	fdcount = do_poll(head, &table, end_time);
972
	poll_freewait(&table);
L
Linus Torvalds 已提交
973

974
	for (walk = head; walk; walk = walk->next) {
L
Linus Torvalds 已提交
975 976 977
		struct pollfd *fds = walk->entries;
		int j;

978 979
		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
L
Linus Torvalds 已提交
980 981
				goto out_fds;
  	}
982

L
Linus Torvalds 已提交
983 984
	err = fdcount;
out_fds:
985 986 987 988 989
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
L
Linus Torvalds 已提交
990
	}
991

L
Linus Torvalds 已提交
992 993
	return err;
}
994

995 996
static long do_restart_poll(struct restart_block *restart_block)
{
997 998
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
999
	struct timespec64 *to = NULL, end_time;
1000 1001
	int ret;

1002 1003 1004 1005 1006 1007 1008 1009
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

1010 1011 1012 1013 1014 1015 1016
	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

/*
 * poll() system call.
 *
 * A negative @timeout_msecs means no timeout is handed to do_sys_poll();
 * otherwise the millisecond value is split into seconds and nanoseconds
 * and converted with poll_select_set_timeout().
 *
 * When do_sys_poll() returns -EINTR, the arguments and the computed
 * timeout are stored in the current task's restart block and
 * -ERESTART_RESTARTBLOCK is returned so that do_restart_poll() can
 * resume the call.  Any other result is returned unchanged.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 deadline, *to = NULL;
	struct restart_block *rb;
	int ret;

	if (timeout_msecs >= 0) {
		to = &deadline;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);
	if (ret != -EINTR)
		return ret;

	rb = &current->restart_block;
	rb->fn = do_restart_poll;
	rb->poll.ufds = ufds;
	rb->poll.nfds = nfds;

	/* "to" is set exactly when a non-negative timeout was supplied */
	if (to) {
		rb->poll.tv_sec = deadline.tv_sec;
		rb->poll.tv_nsec = deadline.tv_nsec;
		rb->poll.has_timeout = 1;
	} else {
		rb->poll.has_timeout = 0;
	}

	return -ERESTART_RESTARTBLOCK;
}

/*
 * ppoll() system call: poll with an optional timespec timeout and an
 * optional temporary signal mask.
 *
 * Returns -EFAULT when @tsp or @sigmask cannot be read, -EINVAL when the
 * timeout is rejected by poll_select_set_timeout() or @sigsetsize is not
 * sizeof(sigset_t).  Otherwise the do_sys_poll() result (with -EINTR
 * turned into -ERESTARTNOHAND) is passed through
 * poll_select_copy_remaining() and returned.
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t newmask, oldmask;
	struct timespec64 rel, deadline, *to = NULL;
	int ret;

	if (tsp) {
		if (get_timespec64(&rel, tsp))
			return -EFAULT;

		to = &deadline;
		if (poll_select_set_timeout(to, rel.tv_sec, rel.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&newmask, sigmask, sizeof(newmask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP are removed from the requested mask */
		sigdelsetmask(&newmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &newmask, &oldmask);
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret != -EINTR) {
		if (sigmask)
			sigprocmask(SIG_SETMASK, &oldmask, NULL);
	} else {
		/*
		 * We can restart this syscall, usually.  Don't restore the
		 * signal mask yet: let do_signal() deliver the signal on
		 * the way back to userspace, before the mask is restored.
		 */
		if (sigmask) {
			current->saved_sigmask = oldmask;
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	}

	return poll_select_copy_remaining(&deadline, tsp, 0, ret);
}
#ifdef CONFIG_COMPAT
/* Number of bits in one compat_ulong_t word (8 bits per byte of its size). */
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

static
D
Deepa Dinamani 已提交
1106
int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
1107 1108
				      int timeval, int ret)
{
D
Deepa Dinamani 已提交
1109
	struct timespec64 ts;
1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

D
Deepa Dinamani 已提交
1121 1122
	ktime_get_ts64(&ts);
	ts = timespec64_sub(*end_time, ts);
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
	if (ts.tv_sec < 0)
		ts.tv_sec = ts.tv_nsec = 0;

	if (timeval) {
		struct compat_timeval rtv;

		rtv.tv_sec = ts.tv_sec;
		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;
	} else {
D
Deepa Dinamani 已提交
1135
		if (!compat_put_timespec64(&ts, p))
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
			return ret;
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Fetch one compat fd bitmap of @nr bits from user space into @fdset.
 * The user side is made of 32-bit unsigned longs which have to end up
 * in 64-bit unsigned longs; compat_get_bitmap() does that conversion.
 *
 * A NULL @ufdset yields a zeroed @fdset and a return value of 0;
 * otherwise the result of compat_get_bitmap() is returned.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
			unsigned long *fdset)
{
	if (!ufdset) {
		zero_fd_set(nr, fdset);
		return 0;
	}

	return compat_get_bitmap(fdset, ufdset, nr);
}

/*
 * Store the @nr-bit kernel bitmap @fdset into the compat user bitmap
 * @ufdset.  A NULL @ufdset is skipped and reported as success (0);
 * otherwise the result of compat_put_bitmap() is returned.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
		      unsigned long *fdset)
{
	return ufdset ? compat_put_bitmap(ufdset, fdset, nr) : 0;
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * Core of the compat select calls.
 *
 * Reads the three compat fd bitmaps @inp, @outp and @exp (each may be
 * NULL), runs do_select() on the first @n descriptors and writes the
 * three result bitmaps back to the same user pointers.  @n is clamped
 * to the max_fds of the current task's fd table.
 *
 * Returns -EINVAL for a negative @n, -ENOMEM when the bitmap buffer
 * cannot be allocated, the error of compat_get_fd_set() when reading a
 * bitmap fails, a negative do_select() result as is, -ERESTARTNOHAND
 * when do_select() returned 0 with a signal pending, -EFAULT when
 * writing a result bitmap fails, and otherwise the do_select() result.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int size, max_fds, ret = -EINVAL;
	struct fdtable *fdt;
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/*
		 * kmalloc_array() checks the 6 * size product for overflow
		 * instead of relying on open-coded multiplication.
		 */
		bits = kmalloc_array(6, size, GFP_KERNEL);
		ret = -ENOMEM;
		if (!bits)
			goto out_nofds;
	}
	/* carve the buffer into six consecutive bitmaps of size bytes */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
	    (ret = compat_get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* nothing ready: restartable only if a signal is pending */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (compat_set_fd_set(n, inp, fds.res_in) ||
	    compat_set_fd_set(n, outp, fds.res_out) ||
	    compat_set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	/* the on-stack buffer must not be handed to kfree() */
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

/*
 * Compat select() system call with a struct compat_timeval timeout.
 *
 * Returns -EFAULT when @tvp cannot be read and -EINVAL when
 * poll_select_set_timeout() rejects the timeout.  Otherwise the
 * compat_core_sys_select() result is passed through
 * compat_poll_select_copy_remaining() in timeval mode and returned.
 */
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timeval __user *, tvp)
{
	struct timespec64 deadline, *to = NULL;
	struct compat_timeval utv;
	int ret;

	if (tvp) {
		if (copy_from_user(&utv, tvp, sizeof(utv)))
			return -EFAULT;

		to = &deadline;
		/* whole seconds carried in tv_usec are folded into tv_sec */
		if (poll_select_set_timeout(to,
				utv.tv_sec + (utv.tv_usec / USEC_PER_SEC),
				(utv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);

	return compat_poll_select_copy_remaining(&deadline, tvp, 1, ret);
}

/*
 * User-space argument block read by the compat old_select system call.
 * The members are forwarded, in this order, as the five arguments of
 * compat_sys_select(); the compat_uptr_t members are converted with
 * compat_ptr() first.
 */
struct compat_sel_arg_struct {
	compat_ulong_t n;	/* forwarded as the first (n) argument */
	compat_uptr_t inp;	/* compat pointer forwarded as inp */
	compat_uptr_t outp;	/* compat pointer forwarded as outp */
	compat_uptr_t exp;	/* compat pointer forwarded as exp */
	compat_uptr_t tvp;	/* compat pointer forwarded as tvp */
};

/*
 * Compat old_select system call: all select arguments arrive packed in
 * one user-space structure.  Returns -EFAULT when @arg cannot be read,
 * otherwise the result of compat_sys_select() on the unpacked values.
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
	struct compat_sel_arg_struct args;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	return compat_sys_select(args.n,
				 compat_ptr(args.inp),
				 compat_ptr(args.outp),
				 compat_ptr(args.exp),
				 compat_ptr(args.tvp));
}

/*
 * Back end of the compat pselect6 system call: select with an optional
 * compat timespec timeout and an optional temporary signal mask.
 *
 * Returns -EFAULT when @tsp or @sigmask cannot be read, -EINVAL when
 * poll_select_set_timeout() rejects the timeout or @sigsetsize is not
 * sizeof(compat_sigset_t).  Otherwise returns the result of
 * compat_core_sys_select() after it has been passed through
 * compat_poll_select_copy_remaining().
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
	compat_size_t sigsetsize)
{
	sigset_t newmask, oldmask;
	struct timespec64 rel, deadline, *to = NULL;
	int ret;

	if (tsp) {
		if (compat_get_timespec64(&rel, tsp))
			return -EFAULT;

		to = &deadline;
		if (poll_select_set_timeout(to, rel.tv_sec, rel.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (get_compat_sigset(&newmask, sigmask))
			return -EFAULT;

		/* SIGKILL and SIGSTOP are removed from the requested mask */
		sigdelsetmask(&newmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &newmask, &oldmask);
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);
	ret = compat_poll_select_copy_remaining(&deadline, tsp, 0, ret);

	if (sigmask) {
		if (ret == -ERESTARTNOHAND) {
			/*
			 * Don't restore the signal mask yet. Let do_signal()
			 * deliver the signal on the way back to userspace,
			 * before the signal mask is restored.
			 */
			current->saved_sigmask = oldmask;
			set_restore_sigmask();
		} else {
			sigprocmask(SIG_SETMASK, &oldmask, NULL);
		}
	}

	return ret;
}

/*
 * Compat pselect6() system call.
 *
 * When @sig is non-NULL it points at a compat_uptr_t (the user signal
 * mask pointer) immediately followed by a compat_size_t (the mask
 * size).  Both are read here and handed to do_compat_pselect(); with a
 * NULL @sig a NULL mask and a size of 0 are passed instead.
 *
 * Returns -EFAULT when the pair cannot be read, otherwise the result of
 * do_compat_pselect().
 */
COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timespec __user *, tsp, void __user *, sig)
{
	compat_size_t sigsetsize = 0;
	compat_uptr_t up = 0;

	if (sig) {
		/* one range check covers both unchecked reads below */
		if (!access_ok(VERIFY_READ, sig,
				sizeof(compat_uptr_t)+sizeof(compat_size_t)))
			return -EFAULT;
		if (__get_user(up, (compat_uptr_t __user *)sig))
			return -EFAULT;
		if (__get_user(sigsetsize,
				(compat_size_t __user *)(sig+sizeof(up))))
			return -EFAULT;
	}

	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
				 sigsetsize);
}

/*
 * Compat ppoll() system call: poll with an optional compat timespec
 * timeout and an optional temporary signal mask.
 *
 * Returns -EFAULT when @tsp or @sigmask cannot be read, -EINVAL when
 * poll_select_set_timeout() rejects the timeout or @sigsetsize is not
 * sizeof(compat_sigset_t).  Otherwise the do_sys_poll() result (with
 * -EINTR turned into -ERESTARTNOHAND) is passed through
 * compat_poll_select_copy_remaining() and returned.
 */
COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
	unsigned int,  nfds, struct compat_timespec __user *, tsp,
	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
	sigset_t newmask, oldmask;
	struct timespec64 rel, deadline, *to = NULL;
	int ret;

	if (tsp) {
		if (compat_get_timespec64(&rel, tsp))
			return -EFAULT;

		to = &deadline;
		if (poll_select_set_timeout(to, rel.tv_sec, rel.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (get_compat_sigset(&newmask, sigmask))
			return -EFAULT;

		/* SIGKILL and SIGSTOP are removed from the requested mask */
		sigdelsetmask(&newmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &newmask, &oldmask);
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret != -EINTR) {
		if (sigmask)
			sigprocmask(SIG_SETMASK, &oldmask, NULL);
	} else {
		/*
		 * We can restart this syscall, usually.  Don't restore the
		 * signal mask yet: let do_signal() deliver the signal on
		 * the way back to userspace, before the mask is restored.
		 */
		if (sigmask) {
			current->saved_sigmask = oldmask;
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	}

	return compat_poll_select_copy_remaining(&deadline, tsp, 0, ret);
}
#endif