select.c 34.9 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

18
#include <linux/kernel.h>
19 20
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
L
Linus Torvalds 已提交
21
#include <linux/syscalls.h>
22
#include <linux/export.h>
L
Linus Torvalds 已提交
23 24 25 26
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
A
Al Viro 已提交
27
#include <linux/fdtable.h>
L
Linus Torvalds 已提交
28
#include <linux/fs.h>
29
#include <linux/rcupdate.h>
30
#include <linux/hrtimer.h>
31
#include <linux/freezer.h>
32
#include <net/busy_poll.h>
33
#include <linux/vmalloc.h>
L
Linus Torvalds 已提交
34

35
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
36

37 38 39 40 41 42 43 44 45 46 47 48 49

/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

50 51
/* Upper bound on the estimated timer slack: 100 ms. */
#define MAX_SLACK	(100 * NSEC_PER_MSEC)

/*
 * Convert a relative timeout into an estimated hrtimer slack in ns:
 * 0.1% of the timeout (0.5% for tasks with a positive nice value),
 * capped at MAX_SLACK.  @tv is the time remaining until expiry.
 */
static long __estimate_accuracy(struct timespec64 *tv)
{
	long slack;
	int divfactor = 1000;	/* slack = timeout / 1000, i.e. 0.1% */

	/* A timeout already in the past gets no slack at all. */
	if (tv->tv_sec < 0)
		return 0;

	/* "nice" (lower-priority) tasks tolerate 5x the slack (0.5%). */
	if (task_nice(current) > 0)
		divfactor = divfactor / 5;

	/* Huge timeouts would overflow the multiplication below; clamp early. */
	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

	if (slack > MAX_SLACK)
		return MAX_SLACK;

	return slack;
}

75
u64 select_estimate_accuracy(struct timespec64 *tv)
76
{
77
	u64 ret;
78
	struct timespec64 now;
79 80 81 82 83

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

84
	if (rt_task(current))
85 86
		return 0;

87 88
	ktime_get_ts64(&now);
	now = timespec64_sub(*tv, now);
89 90 91 92 93 94 95 96
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



L
Linus Torvalds 已提交
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
/*
 * One page worth of poll_table_entry slots, chained together when the
 * inline entries in struct poll_wqueues run out.  @entry points just
 * past the last used slot in @entries.
 */
struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[];	/* C99 flexible array member, not the [0] GNU extension */
};

/* True when no further entry would fit inside this page. */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() make all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
A
Adrian Bunk 已提交
118 119
/* Queueing callback installed into the poll_table; defined below. */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

/*
 * Prepare a poll_wqueues for use by do_select()/do_poll(): hook up
 * __pollwait() as the waitqueue-registration callback and reset all
 * bookkeeping state.  Paired with poll_freewait().
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;	/* the task pollwake() will wake */
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

132 133
/* Detach one table entry from its waitqueue and drop its file reference. */
static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);	/* reference taken by __pollwait() */
}

L
Linus Torvalds 已提交
138 139 140
/*
 * Tear down everything poll_initwait()/__pollwait() built up: release
 * the inline entries first, then walk the chain of overflow pages,
 * freeing each page's used entries (in reverse order of allocation)
 * before freeing the page itself.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		/* p->entry points one past the last used slot. */
		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

T
Tejun Heo 已提交
160
/*
 * Hand out the next free poll_table_entry: the inline array inside
 * struct poll_wqueues is used first, then whole pages are chained on.
 * Returns NULL and records -ENOMEM in p->error if a fresh page cannot
 * be allocated.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		/* Newest page goes to the head of the chain. */
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

184
/*
 * Common wakeup path for pollwake(): mark the poll_wqueues triggered
 * (with the barrier that pairs against poll_schedule_timeout()) and
 * wake the polling task.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

210
/*
 * Waitqueue callback: ignore wakeups whose event mask has no overlap
 * with the events this entry was registered for; otherwise defer to
 * __pollwake() to wake the polling task.
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);

	/* A NULL key means an unconditional wakeup. */
	if (key && !(key_to_poll(key) & entry->key))
		return 0;

	return __pollwake(wait, mode, sync, key);
}

220 221 222 223
/*
 * Add a new entry: invoked from a driver's ->poll() through poll_wait()
 * to register the polling task on @wait_address.  Takes a reference on
 * @filp which free_poll_entry() later drops.  On allocation failure the
 * error has already been recorded by poll_get_entry(), so we silently
 * skip the registration.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);	/* pin the file while queued */
	entry->wait_address = wait_address;
	entry->key = p->_key;		/* events this waiter cares about */
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

T
Tejun Heo 已提交
236 237 238 239 240 241 242
/*
 * Sleep until a registered waitqueue fires (pwq->triggered set by
 * pollwake()), the absolute hrtimer expiry @expires elapses, or a
 * signal arrives.  Returns 0 when the timer expired, -EINTR otherwise.
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following smp_store_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

263 264
/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec64 variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec64_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		/* Convert the relative timeout into an absolute expiry. */
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
	}
	return 0;
}

291 292
/*
 * Write the time remaining until @end_time back to user space at @p,
 * either as a struct timeval (@timeval != 0) or a struct timespec.
 * STICKY_TIMEOUTS personalities and zero timeouts leave the user value
 * untouched.  On copy failure, -ERESTARTNOHAND is downgraded to -EINTR
 * rather than returning -EFAULT (see the comment below); @ret is passed
 * through otherwise.
 */
static int poll_select_copy_remaining(struct timespec64 *end_time,
				      void __user *p,
				      int timeval, int ret)
{
	struct timespec64 rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	/* Remaining time = end_time - now, clamped at zero. */
	ktime_get_ts64(&rts);
	rts = timespec64_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;


	if (timeval) {
		/* Zero first so struct padding doesn't leak kernel stack. */
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts.tv_sec;
		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!put_timespec64(&rts, p))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
/*
 * Scalable version of the fd_set.
 */

typedef struct {
	/* input bitmaps: descriptors the caller wants watched */
	unsigned long *in, *out, *ex;
	/* result bitmaps: descriptors found ready */
	unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
#define FDS_BITPERLONG	(8*sizeof(long))
#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))

/*
 * We do a VERIFY_WRITE here even though we are only reading this time:
 * we'll write to it eventually..
 *
 * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	nr = FDS_BYTES(nr);
	if (ufdset)
		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;

	/* A NULL user fd_set means "watch nothing": all-zero bitmap. */
	memset(fdset, 0, nr);
	return 0;
}

/* Copy a result bitmap back out; returns the number of bytes NOT copied. */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (ufdset)
		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
	return 0;
}

/* Clear an in-kernel result bitmap before polling begins. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	memset(fdset, 0, FDS_BYTES(nr));
}

L
Linus Torvalds 已提交
387 388 389 390 391 392 393 394 395 396 397
#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

/* Union of all three requested bitmaps for longword n. */
#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Validate the requested bitmaps against the current fdtable and return
 * the highest set descriptor number plus one (i.e. the effective "n"
 * for the scan), or -EBADF if any requested descriptor is not open.
 * The caller holds rcu_read_lock() for the files_fdtable() access.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last in-complete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			/* every requested bit must be an open descriptor */
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	/* walk the remaining full longwords from high to low */
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		/* max already found in a higher word; keep validating only */
		if (max)
			continue;
get_max:
		/* position of the highest set bit in this word... */
		do {
			max++;
			set >>= 1;
		} while (set);
		/* ...plus the word's base offset */
		max += n * BITS_PER_LONG;
	}

	return max;
}

435 436 437
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR)
#define POLLEX_SET (EPOLLPRI)
L
Linus Torvalds 已提交
438

439
static inline void wait_key_set(poll_table *wait, unsigned long in,
440
				unsigned long out, unsigned long bit,
A
Al Viro 已提交
441
				__poll_t ll_flag)
442
{
443
	wait->_key = POLLEX_SET | ll_flag;
444 445 446 447
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
448 449
}

450
/*
 * The core of select(): scan the three request bitmaps, calling ->poll()
 * on every watched descriptor.  On the first pass each ->poll() also
 * registers us on its waitqueues (via __pollwait); once anything is
 * ready, wait->_qproc is cleared so later calls only re-check events.
 * Loops until at least one descriptor is ready, the timeout expires, or
 * a signal is pending.  Returns the number of ready descriptor/event
 * pairs, or a -errno.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* Zero timeout: pure non-blocking scan, never register waiters. */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* one longword (BITS_PER_LONG descriptors) at a time */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			__poll_t mask;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op->poll) {
						/* tell ->poll() what we care about */
						wait_key_set(wait, in, out,
							     bit, busy_flag);
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					/*
					 * Any hit clears _qproc: no need to
					 * register further waiters once we
					 * know we won't sleep.
					 */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		/* registration (if any) happened on this pass only */
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
599
/*
 * Common back end of the select() family: bounds-check @n against the
 * fdtable, marshal the three user fd_sets into one kernel buffer of six
 * bitmaps (on the stack when small enough), run do_select(), and copy
 * the result bitmaps back out.  Returns the do_select() count or a
 * -errno; a zero result with a pending signal becomes -ERESTARTNOHAND.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	size_t size, alloc_size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		/* guard the 6*size multiplication below against overflow */
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		alloc_size = 6 * size;
		bits = kvmalloc(alloc_size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* nothing ready: restartable unless a signal is pending */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kvfree(bits);
out_nofds:
	return ret;
}

678 679
/*
 * select(2) front end: read the optional struct timeval, convert it to
 * an absolute timespec64 deadline, run core_sys_select(), and write the
 * remaining time back to user space.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec64 end_time;
	struct timespec64 *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		time64_t sec;
		long nsec;

		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		/* normalize a possibly denormal timeval into sec/nsec */
		sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
		nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

		to = &end_time;
		if (poll_select_set_timeout(to, sec, nsec))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

702 703 704 705 706 707
/* select(2) syscall entry point: thin wrapper around kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	return kern_select(n, inp, outp, exp, tvp);
}

708 709 710
/*
 * pselect() core: like kern_select() but with a timespec timeout and an
 * optional temporary signal mask that is installed around the wait and
 * restored afterwards — lazily via saved_sigmask when a signal must
 * first be delivered with the caller's temporary mask in effect.
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (get_timespec64(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP can never be blocked */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
762 763 764
/*
 * pselect6(2): @sig points to a user struct of { sigset_t __user *ss;
 * size_t ss_len; } — unpack it and hand everything to do_pselect().
 */
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
		    || __get_user(up, (sigset_t __user * __user *)sig)
		    || __get_user(sigsetsize,
				(size_t __user *)(sig+sizeof(void *))))
			return -EFAULT;
	}

	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
}

C
Christoph Hellwig 已提交
780 781 782 783 784 785 786 787 788 789 790 791 792
#ifdef __ARCH_WANT_SYS_OLD_SELECT
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
793
	return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp);
C
Christoph Hellwig 已提交
794 795 796
}
#endif

L
Linus Torvalds 已提交
797 798 799 800 801 802 803 804
/*
 * Chunked in-kernel copy of the caller's pollfd array: the first chunk
 * lives on the stack in do_sys_poll(), later chunks are page-sized
 * kmalloc allocations holding up to POLLFD_PER_PAGE entries each.
 */
struct poll_list {
	struct poll_list *next;
	int len;			/* number of entries used in this chunk */
	struct pollfd entries[];	/* C99 flexible array member, not the [0] GNU extension */
};

#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

805 806 807 808 809
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     __poll_t busy_flag)
{
	__poll_t mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	/* negative fds are ignored per poll(2); revents becomes 0 */
	if (fd >= 0) {
		struct fd f = fdget(fd);
		mask = EPOLLNVAL;	/* fd number is not an open descriptor */
		if (f.file) {
			/* userland u16 ->events contains POLL... bitmap */
			__poll_t filter = demangle_poll(pollfd->events) |
						EPOLLERR | EPOLLHUP;
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op->poll) {
				pwait->_key = filter;
				pwait->_key |= busy_flag;
				mask = f.file->f_op->poll(f.file, pwait);
				if (mask & busy_flag)
					*can_busy_poll = true;
			}
			/* Mask out unneeded events. */
			mask &= filter;
			fdput(f);
		}
	}
	/* ... and so does ->revents */
	pollfd->revents = mangle_poll(mask);

	return mask;
}

847
/*
 * Poll every descriptor in the poll_list chain until at least one has
 * events, the timeout expires, or a signal is pending.  Mirrors the
 * do_select() loop: waitqueue registration (pt->_qproc) happens on the
 * first pass only.  Returns the number of descriptors with events, 0 on
 * timeout, or a -errno.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;	/* e.g. -ENOMEM from __pollwait */
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

933 934 935
/* Number of pollfds that fit in the on-stack first chunk. */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

/*
 * Common back end of poll()/ppoll(): copy the user pollfd array into a
 * chain of poll_list chunks (first chunk on the stack, the rest
 * kmalloc'd page-sized), run do_poll(), then write each ->revents back
 * to user space.  Returns the count from do_poll() or a -errno.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/* first chunk is the on-stack buffer */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/* nfds - todo = how many entries were already consumed */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);

	/* copy only ->revents back; ->fd/->events are untouched */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	/* free the kmalloc'd overflow chunks (head is on the stack) */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}
1000

1001 1002
static long do_restart_poll(struct restart_block *restart_block)
{
1003 1004
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
1005
	struct timespec64 *to = NULL, end_time;
1006 1007
	int ret;

1008 1009 1010 1011 1012 1013 1014 1015
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

1016 1017 1018 1019 1020 1021 1022
	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

1023
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
1024
		int, timeout_msecs)
1025
{
1026
	struct timespec64 end_time, *to = NULL;
1027
	int ret;
1028

1029 1030 1031 1032
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
1033 1034
	}

1035 1036
	ret = do_sys_poll(ufds, nfds, to);

1037 1038
	if (ret == -EINTR) {
		struct restart_block *restart_block;
1039

1040
		restart_block = &current->restart_block;
1041
		restart_block->fn = do_restart_poll;
1042 1043 1044 1045 1046 1047 1048 1049 1050 1051
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

1052 1053 1054
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
1055 1056
}

1057 1058 1059
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
1060 1061
{
	sigset_t ksigmask, sigsaved;
D
Deepa Dinamani 已提交
1062
	struct timespec64 ts, end_time, *to = NULL;
1063 1064 1065
	int ret;

	if (tsp) {
D
Deepa Dinamani 已提交
1066
		if (get_timespec64(&ts, tsp))
1067 1068
			return -EFAULT;

1069 1070 1071
		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

1085
	ret = do_sys_poll(ufds, nfds, to);
1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
1097
			set_restore_sigmask();
1098 1099 1100 1101 1102
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

1103
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
1104 1105 1106

	return ret;
}

#ifdef CONFIG_COMPAT
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Copy the remaining time back to a 32-bit user timeval/timespec.
 * Compat counterpart of poll_select_copy_remaining(): no update for
 * STICKY_TIMEOUTS personalities or zero timeouts, and after a failed
 * copy-out -ERESTARTNOHAND is downgraded to -EINTR because the call
 * can no longer be restarted safely.
 */
static
int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec64 ts;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	/* Convert the absolute expiry back into a remaining delta. */
	ktime_get_ts64(&ts);
	ts = timespec64_sub(*end_time, ts);
	if (ts.tv_sec < 0)
		ts.tv_sec = ts.tv_nsec = 0;

	if (timeval) {
		struct compat_timeval rtv;

		rtv.tv_sec = ts.tv_sec;
		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;
	} else {
		if (!compat_put_timespec64(&ts, p))
			return ret;
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
 * 64-bit unsigned longs.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
			unsigned long *fdset)
{
	if (ufdset) {
		return compat_get_bitmap(fdset, ufdset, nr);
	} else {
		/* No user set supplied: treat it as an empty fd set. */
		zero_fd_set(nr, fdset);
		return 0;
	}
}

/* Copy a kernel fd bitmap out as 32-bit words; NULL destination is a no-op. */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
		      unsigned long *fdset)
{
	if (!ufdset)
		return 0;
	return compat_put_bitmap(ufdset, fdset, nr);
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int size, max_fds, ret = -EINVAL;
	struct fdtable *fdt;
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		bits = kmalloc(6 * size, GFP_KERNEL);
		ret = -ENOMEM;
		if (!bits)
			goto out_nofds;
	}
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
	    (ret = compat_get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: restartable unless a signal is pending. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (compat_set_fd_set(n, inp, fds.res_in) ||
	    compat_set_fd_set(n, outp, fds.res_out) ||
	    compat_set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

1268 1269 1270
static int do_compat_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct compat_timeval __user *tvp)
1271
{
D
Deepa Dinamani 已提交
1272
	struct timespec64 end_time, *to = NULL;
1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292
	struct compat_timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);
	ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

1293 1294 1295 1296 1297 1298 1299
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timeval __user *, tvp)
{
	return do_compat_select(n, inp, outp, exp, tvp);
}

1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
struct compat_sel_arg_struct {
	compat_ulong_t n;
	compat_uptr_t inp;
	compat_uptr_t outp;
	compat_uptr_t exp;
	compat_uptr_t tvp;
};

/*
 * Old-style select(2) for 32-bit tasks: all five arguments arrive in a
 * single user-space struct; unpack it and defer to do_compat_select().
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
	struct compat_sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
				compat_ptr(a.exp), compat_ptr(a.tvp));
}

/*
 * Compat pselect(2) core: 32-bit timespec and sigset.  The temporary
 * signal mask is installed around the wait; on -ERESTARTNOHAND the
 * restore is deferred so do_signal() delivers with the original mask.
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
	compat_size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (compat_get_timespec64(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (get_compat_sigset(&ksigmask, sigmask))
			return -EFAULT;

		/* SIGKILL/SIGSTOP may never be blocked. */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = compat_core_sys_select(n, inp, outp, exp, to);
	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Compat pselect6(2): the final argument is a pointer to a user-space
 * pair { compat_uptr_t sigmask; compat_size_t sigsetsize; } which is
 * unpacked here before delegating to do_compat_pselect().
 */
COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timespec __user *, tsp, void __user *, sig)
{
	compat_size_t sigsetsize = 0;
	compat_uptr_t up = 0;

	if (sig) {
		/* Validate and read both fields of the user pair. */
		if (!access_ok(VERIFY_READ, sig,
				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
		    	__get_user(up, (compat_uptr_t __user *)sig) ||
		    	__get_user(sigsetsize,
				(compat_size_t __user *)(sig+sizeof(up))))
			return -EFAULT;
	}
	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
				 sigsetsize);
}

/*
 * Compat ppoll(2): 32-bit timespec and sigset variant.  On -EINTR the
 * signal-mask restore is deferred via saved_sigmask and the call is
 * marked restartable with -ERESTARTNOHAND.
 */
COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
	unsigned int,  nfds, struct compat_timespec __user *, tsp,
	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (compat_get_timespec64(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (get_compat_sigset(&ksigmask, sigmask))
			return -EFAULT;

		/* SIGKILL/SIGSTOP may never be blocked. */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
				sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}
#endif