pipe.c 22.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
18
#include <linux/pagemap.h>
L
Linus Torvalds 已提交
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * We use a start+len construction, which provides full use of the 
 * allocated memory.
 * -- Florian Coosmann (FGC)
 * 
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/* Drop the inode semaphore and wait for a pipe event, atomically */
39
void pipe_wait(struct pipe_inode_info *pipe)
L
Linus Torvalds 已提交
40 41 42
{
	DEFINE_WAIT(wait);

I
Ingo Molnar 已提交
43 44 45 46
	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
47 48
	prepare_to_wait(&pipe->wait, &wait,
			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
49 50
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
L
Linus Torvalds 已提交
51
	schedule();
52 53 54
	finish_wait(&pipe->wait, &wait);
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);
L
Linus Torvalds 已提交
55 56
}

57
static int
58 59
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
L
Linus Torvalds 已提交
60 61 62 63 64 65 66 67
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

68 69 70 71 72 73 74
		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
L
Linus Torvalds 已提交
75 76 77 78 79 80 81 82
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

83
static int
84 85
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
L
Linus Torvalds 已提交
86 87 88 89 90 91 92 93
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

94 95 96 97 98 99 100
		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
L
Linus Torvalds 已提交
101 102 103 104 105 106 107 108
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
/*
 * Try to fault in the user pages that the next @len bytes of @iov will
 * be written to, so that the copy can then be attempted atomically.
 * The iovec entries themselves are not modified.
 *
 * Returns the number of bytes that were not faulted in (0 when the
 * whole range was handled).
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	unsigned long chunk;

	/* Start at the first segment that still has room. */
	while (iov->iov_len == 0)
		iov++;

	for (; len > 0; len -= chunk, iov++) {
		chunk = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, chunk))
			break;
	}

	return len;
}

/*
 * Fault in the user pages that the next @len bytes of @iov will be read
 * from, so that the copy can then be attempted atomically.  The result
 * of fault_in_pages_readable() is ignored and the iovec entries are not
 * modified.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	unsigned long chunk;

	/* Start at the first segment that still holds data. */
	while (iov->iov_len == 0)
		iov++;

	for (; len > 0; len -= chunk, iov++) {
		chunk = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, chunk);
	}
}

150 151
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
L
Linus Torvalds 已提交
152 153 154
{
	struct page *page = buf->page;

155 156 157
	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
158
	 * allocation cache. (Otherwise just release our reference to it)
159
	 */
160
	if (page_count(page) == 1 && !pipe->tmp_page)
161
		pipe->tmp_page = page;
162 163
	else
		page_cache_release(page);
L
Linus Torvalds 已提交
164 165
}

166
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
167
			   struct pipe_buffer *buf, int atomic)
L
Linus Torvalds 已提交
168
{
169 170 171 172 173
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page, KM_USER0);
	}

L
Linus Torvalds 已提交
174 175 176
	return kmap(buf->page);
}

177
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
178
			    struct pipe_buffer *buf, void *map_data)
L
Linus Torvalds 已提交
179
{
180 181 182 183 184
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data, KM_USER0);
	} else
		kunmap(buf->page);
L
Linus Torvalds 已提交
185 186
}

187 188
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
189
{
190 191 192 193 194 195 196 197
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
198 199
}

200
void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
201 202 203 204
{
	page_cache_get(buf->page);
}

205 206 207 208 209
/*
 * ->pin() implementation that has nothing to do: neither argument is
 * looked at and success (0) is always reported.
 */
int generic_pipe_buf_pin(struct pipe_inode_info *info,
			 struct pipe_buffer *buf)
{
	return 0;
}

210
static const struct pipe_buf_operations anon_pipe_buf_ops = {
L
Linus Torvalds 已提交
211
	.can_merge = 1,
212 213 214
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
L
Linus Torvalds 已提交
215
	.release = anon_pipe_buf_release,
216
	.steal = generic_pipe_buf_steal,
217
	.get = generic_pipe_buf_get,
L
Linus Torvalds 已提交
218 219 220
};

static ssize_t
221 222
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t pos)
L
Linus Torvalds 已提交
223
{
224
	struct file *filp = iocb->ki_filp;
225
	struct inode *inode = filp->f_path.dentry->d_inode;
226
	struct pipe_inode_info *pipe;
L
Linus Torvalds 已提交
227 228 229 230 231 232 233 234 235 236 237 238
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
239
	mutex_lock(&inode->i_mutex);
240
	pipe = inode->i_pipe;
L
Linus Torvalds 已提交
241
	for (;;) {
242
		int bufs = pipe->nrbufs;
L
Linus Torvalds 已提交
243
		if (bufs) {
244 245
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
246
			const struct pipe_buf_operations *ops = buf->ops;
L
Linus Torvalds 已提交
247 248
			void *addr;
			size_t chars = buf->len;
249
			int error, atomic;
L
Linus Torvalds 已提交
250 251 252 253

			if (chars > total_len)
				chars = total_len;

254 255
			error = ops->pin(pipe, buf);
			if (error) {
256
				if (!ret)
257
					error = ret;
258 259
				break;
			}
260

261 262 263 264 265
			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
L
Linus Torvalds 已提交
266
			if (unlikely(error)) {
267 268 269 270 271 272 273
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
274
				if (!ret)
275
					ret = error;
L
Linus Torvalds 已提交
276 277 278 279 280 281 282
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				buf->ops = NULL;
283
				ops->release(pipe, buf);
L
Linus Torvalds 已提交
284
				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
285 286
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
L
Linus Torvalds 已提交
287 288 289 290 291 292 293 294
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
295
		if (!pipe->writers)
L
Linus Torvalds 已提交
296
			break;
297
		if (!pipe->waiting_writers) {
L
Linus Torvalds 已提交
298 299 300 301 302 303 304 305 306 307 308 309 310
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
311 312
			if (!ret)
				ret = -ERESTARTSYS;
L
Linus Torvalds 已提交
313 314 315
			break;
		}
		if (do_wakeup) {
316 317
			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
L
Linus Torvalds 已提交
318
		}
319
		pipe_wait(pipe);
L
Linus Torvalds 已提交
320
	}
321
	mutex_unlock(&inode->i_mutex);
322 323

	/* Signal writers asynchronously that there is more room. */
L
Linus Torvalds 已提交
324
	if (do_wakeup) {
325 326
		wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
L
Linus Torvalds 已提交
327 328 329 330 331 332 333
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static ssize_t
334 335
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	    unsigned long nr_segs, loff_t ppos)
L
Linus Torvalds 已提交
336
{
337
	struct file *filp = iocb->ki_filp;
338
	struct inode *inode = filp->f_path.dentry->d_inode;
339
	struct pipe_inode_info *pipe;
L
Linus Torvalds 已提交
340 341 342 343 344 345 346 347 348 349 350 351 352
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
353
	mutex_lock(&inode->i_mutex);
354
	pipe = inode->i_pipe;
L
Linus Torvalds 已提交
355

356
	if (!pipe->readers) {
L
Linus Torvalds 已提交
357 358 359 360 361 362 363
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
364
	if (pipe->nrbufs && chars != 0) {
365 366
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(PIPE_BUFFERS-1);
367
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
368
		const struct pipe_buf_operations *ops = buf->ops;
L
Linus Torvalds 已提交
369
		int offset = buf->offset + buf->len;
370

L
Linus Torvalds 已提交
371
		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
372
			int error, atomic = 1;
373 374
			void *addr;

375 376
			error = ops->pin(pipe, buf);
			if (error)
377
				goto out;
378

379 380 381
			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
382
			error = pipe_iov_copy_from_user(offset + addr, iov,
383 384
							chars, atomic);
			ops->unmap(pipe, buf, addr);
L
Linus Torvalds 已提交
385 386
			ret = error;
			do_wakeup = 1;
387 388 389 390 391
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
L
Linus Torvalds 已提交
392
				goto out;
393
			}
L
Linus Torvalds 已提交
394 395 396 397 398 399 400 401 402 403
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;
404

405
		if (!pipe->readers) {
L
Linus Torvalds 已提交
406
			send_sig(SIGPIPE, current, 0);
407 408
			if (!ret)
				ret = -EPIPE;
L
Linus Torvalds 已提交
409 410
			break;
		}
411
		bufs = pipe->nrbufs;
L
Linus Torvalds 已提交
412
		if (bufs < PIPE_BUFFERS) {
413 414 415
			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
416 417
			char *src;
			int error, atomic = 1;
L
Linus Torvalds 已提交
418 419 420 421 422 423 424

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
425
				pipe->tmp_page = page;
L
Linus Torvalds 已提交
426
			}
427
			/* Always wake up, even if the copy fails. Otherwise
L
Linus Torvalds 已提交
428 429 430 431 432 433 434 435 436
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

437 438 439 440 441 442 443 444 445 446 447 448 449 450
			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page, KM_USER0);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src, KM_USER0);
			else
				kunmap(page);

L
Linus Torvalds 已提交
451
			if (unlikely(error)) {
452 453 454 455
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
456
				if (!ret)
457
					ret = error;
L
Linus Torvalds 已提交
458 459 460 461 462 463 464 465 466
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
467 468
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;
L
Linus Torvalds 已提交
469 470 471 472 473 474 475 476

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < PIPE_BUFFERS)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
477 478
			if (!ret)
				ret = -EAGAIN;
L
Linus Torvalds 已提交
479 480 481
			break;
		}
		if (signal_pending(current)) {
482 483
			if (!ret)
				ret = -ERESTARTSYS;
L
Linus Torvalds 已提交
484 485 486
			break;
		}
		if (do_wakeup) {
487 488
			wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
L
Linus Torvalds 已提交
489 490
			do_wakeup = 0;
		}
491 492 493
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
L
Linus Torvalds 已提交
494 495
	}
out:
496
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
497
	if (do_wakeup) {
498 499
		wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
L
Linus Torvalds 已提交
500 501
	}
	if (ret > 0)
502
		file_update_time(filp);
L
Linus Torvalds 已提交
503 504 505 506 507 508 509 510 511 512
	return ret;
}

/*
 * ->read for the write-only file_operations tables: every argument is
 * ignored and the call always fails with -EBADF.
 */
static ssize_t bad_pipe_r(struct file *filp, char __user *buf,
			  size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
513 514
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
L
Linus Torvalds 已提交
515 516 517 518 519 520 521 522
{
	return -EBADF;
}

static int
pipe_ioctl(struct inode *pino, struct file *filp,
	   unsigned int cmd, unsigned long arg)
{
523
	struct inode *inode = filp->f_path.dentry->d_inode;
524
	struct pipe_inode_info *pipe;
L
Linus Torvalds 已提交
525 526 527 528
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
529
			mutex_lock(&inode->i_mutex);
530
			pipe = inode->i_pipe;
L
Linus Torvalds 已提交
531
			count = 0;
532 533
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
L
Linus Torvalds 已提交
534
			while (--nrbufs >= 0) {
535
				count += pipe->bufs[buf].len;
L
Linus Torvalds 已提交
536 537
				buf = (buf+1) & (PIPE_BUFFERS-1);
			}
538
			mutex_unlock(&inode->i_mutex);
539

L
Linus Torvalds 已提交
540 541 542 543 544 545 546 547 548 549 550
			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}

/*
 * poll handler for pipes and FIFOs.  No kernel lock held - fine.
 *
 * Readable ends report POLLIN | POLLRDNORM while buffers are queued and
 * POLLHUP once there are no writers and filp->f_version differs from
 * the pipe's w_counter.  Writable ends report POLLOUT | POLLWRNORM
 * while a buffer slot is free and POLLERR once there are no readers.
 */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	unsigned int mask = 0;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore.  */
	nrbufs = pipe->nrbufs;

	if (filp->f_mode & FMODE_READ) {
		if (nrbufs > 0)
			mask |= POLLIN | POLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (nrbufs < PIPE_BUFFERS)
			mask |= POLLOUT | POLLWRNORM;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
582 583
	struct pipe_inode_info *pipe;

584
	mutex_lock(&inode->i_mutex);
585 586 587
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;
588

589
	if (!pipe->readers && !pipe->writers) {
L
Linus Torvalds 已提交
590 591
		free_pipe_info(inode);
	} else {
592 593 594
		wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
L
Linus Torvalds 已提交
595
	}
596
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
597 598 599 600 601 602 603

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
604
	struct inode *inode = filp->f_path.dentry->d_inode;
L
Linus Torvalds 已提交
605 606
	int retval;

607 608 609
	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
610 611 612 613 614 615 616 617 618 619 620

	if (retval < 0)
		return retval;

	return 0;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
621
	struct inode *inode = filp->f_path.dentry->d_inode;
L
Linus Torvalds 已提交
622 623
	int retval;

624 625 626
	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
627 628 629 630 631 632 633 634 635 636 637

	if (retval < 0)
		return retval;

	return 0;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
638
	struct inode *inode = filp->f_path.dentry->d_inode;
639
	struct pipe_inode_info *pipe = inode->i_pipe;
L
Linus Torvalds 已提交
640 641
	int retval;

642
	mutex_lock(&inode->i_mutex);
L
Linus Torvalds 已提交
643

644
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
L
Linus Torvalds 已提交
645 646

	if (retval >= 0)
647
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
L
Linus Torvalds 已提交
648

649
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687

	if (retval < 0)
		return retval;

	return 0;
}


/*
 * ->release for read ends: take the file off the readers' fasync list,
 * then drop one reader from the pipe.
 */
static int pipe_read_release(struct inode *inode, struct file *filp)
{
	pipe_read_fasync(-1, filp, 0);

	return pipe_release(inode, 1, 0);
}

/*
 * ->release for write ends: take the file off the writers' fasync list,
 * then drop one writer from the pipe.
 */
static int pipe_write_release(struct inode *inode, struct file *filp)
{
	pipe_write_fasync(-1, filp, 0);

	return pipe_release(inode, 0, 1);
}

/*
 * ->release for read/write ends: take the file off both fasync lists,
 * then drop a reader and/or a writer according to the open mode.
 */
static int pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr = 0;
	int decw = 0;

	pipe_rdwr_fasync(-1, filp, 0);

	if (filp->f_mode & FMODE_READ)
		decr = 1;
	if (filp->f_mode & FMODE_WRITE)
		decw = 1;

	return pipe_release(inode, decr, decw);
}

/*
 * ->open for read ends: count one more reader.
 *
 * The counters are plain integers protected by the inode mutex; the
 * open helpers are the only places that would gain from atomic_t, so
 * it was not considered worthwhile.
 */
static int pipe_read_open(struct inode *inode, struct file *filp)
{
	mutex_lock(&inode->i_mutex);
	inode->i_pipe->readers++;
	mutex_unlock(&inode->i_mutex);

	return 0;
}

/* ->open for write ends: count one more writer under the inode mutex. */
static int pipe_write_open(struct inode *inode, struct file *filp)
{
	mutex_lock(&inode->i_mutex);
	inode->i_pipe->writers++;
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
708
	mutex_lock(&inode->i_mutex);
L
Linus Torvalds 已提交
709
	if (filp->f_mode & FMODE_READ)
710
		inode->i_pipe->readers++;
L
Linus Torvalds 已提交
711
	if (filp->f_mode & FMODE_WRITE)
712 713
		inode->i_pipe->writers++;
	mutex_unlock(&inode->i_mutex);
L
Linus Torvalds 已提交
714 715 716 717 718 719 720 721

	return 0;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 */
722
const struct file_operations read_fifo_fops = {
L
Linus Torvalds 已提交
723
	.llseek		= no_llseek,
724 725
	.read		= do_sync_read,
	.aio_read	= pipe_read,
L
Linus Torvalds 已提交
726
	.write		= bad_pipe_w,
727
	.poll		= pipe_poll,
L
Linus Torvalds 已提交
728 729 730 731 732 733
	.ioctl		= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

734
const struct file_operations write_fifo_fops = {
L
Linus Torvalds 已提交
735 736
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
737 738
	.write		= do_sync_write,
	.aio_write	= pipe_write,
739
	.poll		= pipe_poll,
L
Linus Torvalds 已提交
740 741 742 743 744 745
	.ioctl		= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

746
const struct file_operations rdwr_fifo_fops = {
L
Linus Torvalds 已提交
747
	.llseek		= no_llseek,
748 749 750 751
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
752
	.poll		= pipe_poll,
L
Linus Torvalds 已提交
753 754 755 756 757 758
	.ioctl		= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

759
static const struct file_operations read_pipe_fops = {
L
Linus Torvalds 已提交
760
	.llseek		= no_llseek,
761 762
	.read		= do_sync_read,
	.aio_read	= pipe_read,
L
Linus Torvalds 已提交
763 764 765 766 767 768 769 770
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.ioctl		= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

771
static const struct file_operations write_pipe_fops = {
L
Linus Torvalds 已提交
772 773
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
774 775
	.write		= do_sync_write,
	.aio_write	= pipe_write,
L
Linus Torvalds 已提交
776 777 778 779 780 781 782
	.poll		= pipe_poll,
	.ioctl		= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

783
static const struct file_operations rdwr_pipe_fops = {
L
Linus Torvalds 已提交
784
	.llseek		= no_llseek,
785 786 787 788
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
L
Linus Torvalds 已提交
789 790 791 792 793 794 795
	.poll		= pipe_poll,
	.ioctl		= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

796 797
struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
798
	struct pipe_inode_info *pipe;
799

800 801 802 803 804
	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		init_waitqueue_head(&pipe->wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->inode = inode;
805 806
	}

807
	return pipe;
808 809
}

810
void __free_pipe_info(struct pipe_inode_info *pipe)
L
Linus Torvalds 已提交
811 812 813 814
{
	int i;

	for (i = 0; i < PIPE_BUFFERS; i++) {
815
		struct pipe_buffer *buf = pipe->bufs + i;
L
Linus Torvalds 已提交
816
		if (buf->ops)
817
			buf->ops->release(pipe, buf);
L
Linus Torvalds 已提交
818
	}
819 820 821
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe);
L
Linus Torvalds 已提交
822 823
}

824 825 826 827 828 829
/*
 * Free the pipe info attached to @inode and clear inode->i_pipe so the
 * stale pointer cannot be used again.
 */
void free_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe = inode->i_pipe;

	__free_pipe_info(pipe);
	inode->i_pipe = NULL;
}

830
/*
 * Internal mount of pipefs.  Set once by init_pipe_fs() via kern_mount();
 * its superblock supplies the inodes and the root dentry for every pipe.
 */
static struct vfsmount *pipe_mnt __read_mostly;
L
Linus Torvalds 已提交
831 832
static int pipefs_delete_dentry(struct dentry *dentry)
{
833 834 835 836 837 838 839 840
	/*
	 * At creation time, we pretended this dentry was hashed
	 * (by clearing DCACHE_UNHASHED bit in d_flags)
	 * At delete time, we restore the truth : not hashed.
	 * (so that dput() can proceed correctly)
	 */
	dentry->d_flags |= DCACHE_UNHASHED;
	return 0;
L
Linus Torvalds 已提交
841
}
842

L
Linus Torvalds 已提交
843 844 845 846 847 848 849
/* Dentry operations installed on every pipe dentry by create_write_pipe(). */
static struct dentry_operations pipefs_dentry_operations = {
	.d_delete	= pipefs_delete_dentry,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode(pipe_mnt->mnt_sb);
850
	struct pipe_inode_info *pipe;
L
Linus Torvalds 已提交
851 852 853 854

	if (!inode)
		goto fail_inode;

855 856
	pipe = alloc_pipe_info(inode);
	if (!pipe)
L
Linus Torvalds 已提交
857
		goto fail_iput;
858
	inode->i_pipe = pipe;
859

860
	pipe->readers = pipe->writers = 1;
L
Linus Torvalds 已提交
861 862 863 864 865 866 867 868 869 870 871 872 873
	inode->i_fop = &rdwr_pipe_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
874

L
Linus Torvalds 已提交
875 876 877 878
	return inode;

fail_iput:
	iput(inode);
879

L
Linus Torvalds 已提交
880 881 882 883
fail_inode:
	return NULL;
}

A
Andi Kleen 已提交
884
struct file *create_write_pipe(void)
L
Linus Torvalds 已提交
885
{
A
Andi Kleen 已提交
886 887 888
	int err;
	struct inode *inode;
	struct file *f;
L
Linus Torvalds 已提交
889
	struct dentry *dentry;
A
Andi Kleen 已提交
890 891
	char name[32];
	struct qstr this;
L
Linus Torvalds 已提交
892

A
Andi Kleen 已提交
893 894 895 896
	f = get_empty_filp();
	if (!f)
		return ERR_PTR(-ENFILE);
	err = -ENFILE;
L
Linus Torvalds 已提交
897 898
	inode = get_pipe_inode();
	if (!inode)
A
Andi Kleen 已提交
899
		goto err_file;
L
Linus Torvalds 已提交
900

901
	this.len = sprintf(name, "[%lu]", inode->i_ino);
L
Linus Torvalds 已提交
902
	this.name = name;
903
	this.hash = 0;
A
Andi Kleen 已提交
904
	err = -ENOMEM;
L
Linus Torvalds 已提交
905 906
	dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
	if (!dentry)
A
Andi Kleen 已提交
907
		goto err_inode;
908

L
Linus Torvalds 已提交
909
	dentry->d_op = &pipefs_dentry_operations;
910 911 912 913 914 915 916
	/*
	 * We dont want to publish this dentry into global dentry hash table.
	 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
	 * This permits a working /proc/$pid/fd/XXX on pipes
	 */
	dentry->d_flags &= ~DCACHE_UNHASHED;
	d_instantiate(dentry, inode);
917 918
	f->f_path.mnt = mntget(pipe_mnt);
	f->f_path.dentry = dentry;
A
Andi Kleen 已提交
919
	f->f_mapping = inode->i_mapping;
920

A
Andi Kleen 已提交
921 922 923 924 925 926
	f->f_flags = O_WRONLY;
	f->f_op = &write_pipe_fops;
	f->f_mode = FMODE_WRITE;
	f->f_version = 0;

	return f;
L
Linus Torvalds 已提交
927

A
Andi Kleen 已提交
928
 err_inode:
L
Linus Torvalds 已提交
929 930
	free_pipe_info(inode);
	iput(inode);
A
Andi Kleen 已提交
931 932 933 934 935 936 937
 err_file:
	put_filp(f);
	return ERR_PTR(err);
}

/*
 * Undo create_write_pipe() for a write end that was never installed in
 * a descriptor table.
 *
 * @f must be a file returned by a successful create_write_pipe().
 *
 * The pipe_inode_info is freed explicitly, as create_write_pipe()'s own
 * error path does; nothing in this file frees it when the inode goes
 * away.  The dentry reference is dropped before the vfsmount reference
 * so the mount is still pinned while the dentry (and through it the
 * inode) is being released.
 */
void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_path.dentry->d_inode);
	dput(f->f_path.dentry);
	mntput(f->f_path.mnt);
	put_filp(f);
}

/*
 * Build the read-end struct file for the pipe behind the write end
 * @wrf.
 *
 * The new file takes its own references on @wrf's vfsmount and dentry.
 * Returns the file, or ERR_PTR(-ENFILE) when no struct file could be
 * obtained.
 */
struct file *create_read_pipe(struct file *wrf)
{
	struct file *f;

	f = get_empty_filp();
	if (!f)
		return ERR_PTR(-ENFILE);

	/* Grab pipe from the writer */
	f->f_path.mnt = mntget(wrf->f_path.mnt);
	f->f_path.dentry = dget(wrf->f_path.dentry);
	f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;

	f->f_pos = 0;
	f->f_flags = O_RDONLY;
	f->f_op = &read_pipe_fops;
	f->f_mode = FMODE_READ;
	f->f_version = 0;

	return f;
}

int do_pipe(int *fd)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	fw = create_write_pipe();
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd();
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd();
	if (error < 0)
		goto err_fdr;
	fdw = error;

	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
L
Linus Torvalds 已提交
1001 1002 1003 1004 1005 1006 1007 1008
}

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
1009 1010 1011
static int pipefs_get_sb(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data,
			 struct vfsmount *mnt)
L
Linus Torvalds 已提交
1012
{
1013
	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
L
Linus Torvalds 已提交
1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
}

/*
 * Filesystem type for pipefs; registered and kern_mount()ed by
 * init_pipe_fs().
 */
static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.get_sb		= pipefs_get_sb,
	.kill_sb	= kill_anon_super,
};

/*
 * Register pipefs and set up its internal mount in pipe_mnt.
 *
 * Returns 0 on success.  If registration fails its error is returned;
 * if the mount fails the filesystem is unregistered again and the
 * mount error is returned.
 */
static int __init init_pipe_fs(void)
{
	int err;

	err = register_filesystem(&pipe_fs_type);
	if (err)
		return err;

	pipe_mnt = kern_mount(&pipe_fs_type);
	if (IS_ERR(pipe_mnt)) {
		err = PTR_ERR(pipe_mnt);
		unregister_filesystem(&pipe_fs_type);
	}

	return err;
}

/*
 * Module exit: unregister pipefs, then drop the internal mount that
 * init_pipe_fs() stored in pipe_mnt.
 */
static void __exit exit_pipe_fs(void)
{
	unregister_filesystem(&pipe_fs_type);

	mntput(pipe_mnt);
}

/* Hook pipefs setup and teardown into the init/exit machinery. */
fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);