fanotify_user.c 27.3 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
#include <linux/fanotify.h>
3
#include <linux/fcntl.h>
4
#include <linux/file.h>
5
#include <linux/fs.h>
6
#include <linux/anon_inodes.h>
7
#include <linux/fsnotify_backend.h>
8
#include <linux/init.h>
E
Eric Paris 已提交
9
#include <linux/mount.h>
10
#include <linux/namei.h>
E
Eric Paris 已提交
11
#include <linux/poll.h>
12 13
#include <linux/security.h>
#include <linux/syscalls.h>
T
Tejun Heo 已提交
14
#include <linux/slab.h>
15
#include <linux/types.h>
E
Eric Paris 已提交
16
#include <linux/uaccess.h>
17
#include <linux/compat.h>
18
#include <linux/sched/signal.h>
19
#include <linux/memcontrol.h>
20 21
#include <linux/statfs.h>
#include <linux/exportfs.h>
E
Eric Paris 已提交
22 23

#include <asm/ioctls.h>
24

25
#include "../../mount.h"
26
#include "../fdinfo.h"
27
#include "fanotify.h"
28

29
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
30
#define FANOTIFY_DEFAULT_MAX_MARKS	8192
31
#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
32

33 34 35 36 37 38 39 40 41 42 43 44 45
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)

46
extern const struct fsnotify_ops fanotify_fsnotify_ops;
47

48
struct kmem_cache *fanotify_mark_cache __read_mostly;
49
struct kmem_cache *fanotify_event_cachep __read_mostly;
50
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
51

52 53 54 55 56 57 58 59 60 61 62 63
#define FANOTIFY_EVENT_ALIGN 4

static int fanotify_event_info_len(struct fanotify_event *event)
{
	if (!fanotify_event_has_fid(event))
		return 0;

	return roundup(sizeof(struct fanotify_event_info_fid) +
		       sizeof(struct file_handle) + event->fh_len,
		       FANOTIFY_EVENT_ALIGN);
}

E
Eric Paris 已提交
64 65 66 67 68
/*
 * Get an fsnotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough.
 *
 * Called with the group->notification_lock held.
 */
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size = FAN_EVENT_METADATA_LEN;
	struct fanotify_event *event;

	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	if (fsnotify_notify_queue_is_empty(group))
		return NULL;

	/* Groups in fid reporting mode append variable size fid info */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		event = FANOTIFY_E(fsnotify_peek_first_event(group));
		event_size += fanotify_event_info_len(event);
	}

	if (event_size > count)
		return ERR_PTR(-EINVAL);

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above
	 */
	return fsnotify_remove_first_event(group);
}

99
/*
 * Open a new file for event->path on behalf of the listener and reserve
 * an fd number for it.  On success returns the reserved fd and stores the
 * opened file in *file (caller must fd_install() it); on failure returns
 * a negative errno and no fd remains reserved.
 */
static int create_fd(struct fsnotify_group *group,
		     struct fanotify_event *event,
		     struct file **file)
{
	int client_fd;
	struct file *new_file;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
	if (client_fd < 0)
		return client_fd;

	/*
	 * we need a new file handle for the userspace program so it can read even if it was
	 * originally opened O_WRONLY.
	 */
	/* it's possible this event was an overflow event.  in that case dentry and mnt
	 * are NULL;  That's fine, just don't call dentry open */
	if (event->path.dentry && event->path.mnt)
		new_file = dentry_open(&event->path,
				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
				       current_cred());
	else
		new_file = ERR_PTR(-EOVERFLOW);
	if (IS_ERR(new_file)) {
		/*
		 * we still send an event even if we can't open the file.  this
		 * can happen when say tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * we just send the errno to userspace since there isn't much
		 * else we can do.
		 */
		put_unused_fd(client_fd);
		client_fd = PTR_ERR(new_file);
	} else {
		*file = new_file;
	}

	return client_fd;
}

141
static struct fanotify_perm_event *dequeue_event(
142
				struct fsnotify_group *group, int fd)
143
{
144
	struct fanotify_perm_event *event, *return_e = NULL;
145

146
	spin_lock(&group->notification_lock);
147 148 149
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
150 151
			continue;

152 153
		list_del_init(&event->fae.fse.list);
		return_e = event;
154 155
		break;
	}
156
	spin_unlock(&group->notification_lock);
157

158
	pr_debug("%s: found return_re=%p\n", __func__, return_e);
159

160
	return return_e;
161 162 163 164 165
}

/*
 * Handle a fanotify_response written by userspace: validate it, match it
 * to a pending permission event and wake up the waiter blocked in
 * fanotify_get_response().  Returns 0 on success, -EINVAL for a malformed
 * response or -ENOENT if no event with that fd is pending.
 */
static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	int response = response_struct->response;

	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
		 fd, response);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	switch (response & ~FAN_AUDIT) {
	case FAN_ALLOW:
	case FAN_DENY:
		break;
	default:
		return -EINVAL;
	}

	if (fd < 0)
		return -EINVAL;

	/* FAN_AUDIT is only valid if the group opted in at init time */
	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	event = dequeue_event(group, fd);
	if (!event)
		return -ENOENT;

	event->response = response;
	wake_up(&group->fanotify_data.access_waitq);

	return 0;
}

201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
/*
 * Copy the event's fid info record (fixed size fanotify_event_info_fid
 * header followed by a variable sized file handle) to the userspace
 * buffer, padding up to FANOTIFY_EVENT_ALIGN with zeroes.
 * Returns 0 on success or -EFAULT.
 */
static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
{
	struct fanotify_event_info_fid info = { };
	struct file_handle handle = { };
	size_t fh_len = event->fh_len;
	size_t len = fanotify_event_info_len(event);

	if (!len)
		return 0;

	if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
		return -EFAULT;

	/* Copy event info fid header followed by variable sized file handle */
	info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
	info.hdr.len = len;
	info.fsid = event->fid.fsid;
	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	buf += sizeof(info);
	len -= sizeof(info);
	handle.handle_type = event->fh_type;
	handle.handle_bytes = fh_len;
	if (copy_to_user(buf, &handle, sizeof(handle)))
		return -EFAULT;

	buf += sizeof(handle);
	len -= sizeof(handle);
	if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
		return -EFAULT;

	/* Pad with 0's */
	buf += fh_len;
	len -= fh_len;
	/*
	 * len is size_t, so the former "len < 0" test was always false;
	 * an underflow above would show up here as a huge value and is
	 * caught by the upper bound check instead.
	 */
	WARN_ON_ONCE(len >= FANOTIFY_EVENT_ALIGN);
	if (len > 0 && clear_user(buf, len))
		return -EFAULT;

	return 0;
}

E
Eric Paris 已提交
243
/*
 * Copy one event to the userspace buffer: fixed size metadata, then for
 * fid mode groups the variable size fid info.  For path events an fd for
 * the object is created and installed; for permission events the fd is
 * also recorded so the response can be matched later.  Returns the number
 * of bytes copied or a negative errno.
 */
static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fsnotify_event *fsn_event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	struct fanotify_event *event;
	struct file *f = NULL;
	int ret, fd = FAN_NOFD;

	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);

	event = container_of(fsn_event, struct fanotify_event, fse);
	metadata.event_len = FAN_EVENT_METADATA_LEN;
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);

	if (fanotify_event_has_path(event)) {
		fd = create_fd(group, event, &f);
		if (fd < 0)
			return fd;
	} else if (fanotify_event_has_fid(event)) {
		metadata.event_len += fanotify_event_info_len(event);
	}
	metadata.fd = fd;

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * fill_event_metadata() event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	/* Remember the fd so process_access_response() can match it */
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PE(fsn_event)->fd = fd;

	if (fanotify_event_has_path(event)) {
		fd_install(fd, f);
	} else if (fanotify_event_has_fid(event)) {
		/* fid events never created an fd, so plain return is safe */
		ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
		if (ret < 0)
			return ret;
	}

	return metadata.event_len;

out_close_fd:
	if (fd != FAN_NOFD) {
		put_unused_fd(fd);
		fput(f);
	}
	return ret;
}

/* fanotify userspace file descriptor functions */
A
Al Viro 已提交
304
/*
 * poll() support: report readable when the notification queue is
 * non-empty.
 */
static __poll_t fanotify_poll(struct file *file, poll_table *wait)
{
	struct fsnotify_group *group = file->private_data;
	__poll_t events = 0;

	poll_wait(file, &group->notification_waitq, wait);

	/* Queue emptiness must be checked under the notification lock */
	spin_lock(&group->notification_lock);
	if (!fsnotify_notify_queue_is_empty(group))
		events = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&group->notification_lock);

	return events;
}

/*
 * read() support: copy as many queued events as fit in the buffer.
 * Blocks (unless O_NONBLOCK) when the queue is empty and nothing has
 * been copied yet.  Permission events are moved to access_list to wait
 * for a userspace response; other events are destroyed after copying.
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fsnotify_event *kevent;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		spin_lock(&group->notification_lock);
		kevent = get_one_event(group, count);
		spin_unlock(&group->notification_lock);

		/* Error pointer means the event does not fit in count */
		if (IS_ERR(kevent)) {
			ret = PTR_ERR(kevent);
			break;
		}

		if (!kevent) {
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			/* Return what we already copied rather than sleep */
			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, kevent, buf, count);
		if (unlikely(ret == -EOPENSTALE)) {
			/*
			 * We cannot report events with stale fd so drop it.
			 * Setting ret to 0 will continue the event loop and
			 * do the right thing if there are no more events to
			 * read (i.e. return bytes read, -EAGAIN or wait).
			 */
			ret = 0;
		}

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
			fsnotify_destroy_event(group, kevent);
		} else {
			if (ret <= 0) {
				/* Copy failed: deny and release the waiter */
				FANOTIFY_PE(kevent)->response = FAN_DENY;
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&kevent->list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}

399 400 401 402 403 404
/*
 * write() support: userspace answers a pending permission event by
 * writing a struct fanotify_response.  Returns the number of bytes
 * consumed or a negative errno.
 */
static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	struct fanotify_response response = { .fd = -1, .response = -1 };
	struct fsnotify_group *group;
	int ret;

	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		return -EINVAL;

	group = file->private_data;

	/* Short writes are allowed; extra bytes are ignored */
	if (count > sizeof(response))
		count = sizeof(response);

	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);

	if (copy_from_user(&response, buf, count))
		return -EFAULT;

	ret = process_access_response(group, &response);
	if (ret < 0)
		count = ret;

	return count;
}

425 426 427
/*
 * release() for the fanotify fd: stop queueing, answer all outstanding
 * permission events so blocked tasks can proceed, drain the notification
 * queue and drop the group reference taken at fanotify_init time.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fanotify_perm_event *event, *next;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
				 fae.fse.list) {
		pr_debug("%s: found group=%p event=%p\n", __func__, group,
			 event);

		list_del_init(&event->fae.fse.list);
		event->response = FAN_ALLOW;
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while (!fsnotify_notify_queue_is_empty(group)) {
		fsn_event = fsnotify_remove_first_event(group);
		if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
			/* Drop the lock; destroying an event may sleep */
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
			spin_lock(&group->notification_lock);
		} else {
			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
		}
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wakeup waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}

E
Eric Paris 已提交
478 479 480
/*
 * ioctl() support: FIONREAD reports the number of bytes a read() of all
 * queued events would return (metadata only, as counted here).
 */
static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct fsnotify_group *group;
	struct fsnotify_event *fsn_event;
	void __user *p;
	int ret = -ENOTTY;
	size_t send_len = 0;

	group = file->private_data;

	p = (void __user *) arg;

	switch (cmd) {
	case FIONREAD:
		spin_lock(&group->notification_lock);
		list_for_each_entry(fsn_event, &group->notification_list, list)
			send_len += FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;
	}

	return ret;
}

503
/* File operations for the anonymous fd returned by fanotify_init(2) */
static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= fanotify_ioctl,
	.llseek		= noop_llseek,
};

515 516 517 518 519 520 521 522 523
/*
 * Resolve the dfd/filename pair of fanotify_mark(2) to a struct path.
 * A NULL filename means "the object dfd refers to".  The caller must
 * path_put() on success.  FAN_MARK_ONLYDIR and FAN_MARK_DONT_FOLLOW
 * affect the lookup as their names suggest.
 */
static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		struct fd f = fdget(dfd);

		ret = -EBADF;
		if (!f.file)
			goto out;

		ret = -ENOTDIR;
		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
			fdput(f);
			goto out;
		}

		*path = f.file->f_path;
		path_get(path);
		fdput(f);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = inode_permission(path->dentry->d_inode, MAY_READ);
	if (ret)
		path_put(path);
out:
	return ret;
}

561 562
/*
 * Remove bits from a mark's event mask (or ignored mask, with
 * FAN_MARK_IGNORED_MASK).  Sets *destroy when both masks become empty
 * so the caller can tear the mark down.  Returns the event-mask bits
 * that were actually cleared (always 0 for the ignored mask case).
 */
static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    __u32 mask,
					    unsigned int flags,
					    int *destroy)
{
	__u32 oldmask = 0;

	spin_lock(&fsn_mark->lock);
	if (!(flags & FAN_MARK_IGNORED_MASK)) {
		oldmask = fsn_mark->mask;
		fsn_mark->mask &= ~mask;
	} else {
		fsn_mark->ignored_mask &= ~mask;
	}
	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
	spin_unlock(&fsn_mark->lock);

	return mask & oldmask;
}

581 582 583
/*
 * Remove mask bits from this group's mark on the object behind @connp,
 * recalculating the connector mask and destroying the mark if it became
 * empty.  Returns -ENOENT if the group has no mark on the object.
 */
static int fanotify_remove_mark(struct fsnotify_group *group,
				fsnotify_connp_t *connp, __u32 mask,
				unsigned int flags)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		mutex_unlock(&group->mark_mutex);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 &destroy_mark);
	/* Only recalc if bits visible on the object were cleared */
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	mutex_unlock(&group->mark_mutex);
	/* Freeing must happen outside mark_mutex */
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}
610

611 612 613 614 615 616 617 618
/* Remove mask bits from the group's mark on a mount point */
static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
					 struct vfsmount *mnt, __u32 mask,
					 unsigned int flags)
{
	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				    mask, flags);
}

619 620 621 622 623 624 625
/* Remove mask bits from the group's mark on a whole filesystem (sb) */
static int fanotify_remove_sb_mark(struct fsnotify_group *group,
				      struct super_block *sb, __u32 mask,
				      unsigned int flags)
{
	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags);
}

626
/* Remove mask bits from the group's mark on a single inode */
static int fanotify_remove_inode_mark(struct fsnotify_group *group,
				      struct inode *inode, __u32 mask,
				      unsigned int flags)
{
	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
				    flags);
}

634 635 636
/*
 * Add bits to a mark's event mask (or ignored mask, with
 * FAN_MARK_IGNORED_MASK).  Returns the event-mask bits that were newly
 * set (always 0 for the ignored mask case, since oldmask stays ~0).
 */
static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				       __u32 mask,
				       unsigned int flags)
{
	__u32 oldmask = -1;

	spin_lock(&fsn_mark->lock);
	if (!(flags & FAN_MARK_IGNORED_MASK)) {
		oldmask = fsn_mark->mask;
		fsn_mark->mask |= mask;
	} else {
		fsn_mark->ignored_mask |= mask;
		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
	}
	spin_unlock(&fsn_mark->lock);

	return mask & ~oldmask;
}

654
/*
 * Allocate a new mark for this group and attach it to the object behind
 * @connp.  Caller holds group->mark_mutex (hence the _locked attach).
 * Returns the mark (with a reference held) or an ERR_PTR: -ENOSPC when
 * the group's mark limit is exceeded, -ENOMEM or the attach error.
 */
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   fsnotify_connp_t *connp,
						   unsigned int type,
						   __kernel_fsid_t *fsid)
{
	struct fsnotify_mark *mark;
	int ret;

	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
		return ERR_PTR(-ENOSPC);

	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!mark)
		return ERR_PTR(-ENOMEM);

	fsnotify_init_mark(mark, group);
	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
	if (ret) {
		fsnotify_put_mark(mark);
		return ERR_PTR(ret);
	}

	return mark;
}


680 681
/*
 * Find this group's mark on the object behind @connp, creating one if
 * needed, and add the requested mask bits, recalculating the connector
 * mask when new bits became visible.
 */
static int fanotify_add_mark(struct fsnotify_group *group,
			     fsnotify_connp_t *connp, unsigned int type,
			     __u32 mask, unsigned int flags,
			     __kernel_fsid_t *fsid)
{
	struct fsnotify_mark *fsn_mark;
	__u32 added;

	mutex_lock(&group->mark_mutex);
	fsn_mark = fsnotify_find_mark(connp, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
		if (IS_ERR(fsn_mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(fsn_mark);
		}
	}
	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
	/* Only recalc if bits not yet visible on the object were added */
	if (added & ~fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	mutex_unlock(&group->mark_mutex);

	/* drop the ref from find_mark / add_new_mark */
	fsnotify_put_mark(fsn_mark);
	return 0;
}

706 707
/* Add mask bits to the group's mark on a mount point */
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
				      struct vfsmount *mnt, __u32 mask,
				      unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}

714
/* Add mask bits to the group's mark on a whole filesystem (sb) */
static int fanotify_add_sb_mark(struct fsnotify_group *group,
				struct super_block *sb, __u32 mask,
				unsigned int flags, __kernel_fsid_t *fsid)
{
	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}

722
/* Add mask bits to the group's mark on a single inode */
static int fanotify_add_inode_mark(struct fsnotify_group *group,
				   struct inode *inode, __u32 mask,
				   unsigned int flags, __kernel_fsid_t *fsid)
{
	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignored mark, unless that ignored mark is supposed to survive
	 * modification changes anyway.
	 */
	if ((flags & FAN_MARK_IGNORED_MASK) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    inode_is_open_for_write(inode))
		return 0;

	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}
741

742
/* fanotify syscalls */
743
/*
 * fanotify_init(2): create a new fanotify group and return an fd for it.
 *
 * @flags: notification class, FAN_CLOEXEC/FAN_NONBLOCK, unlimited
 *         queue/marks, FAN_REPORT_FID, FAN_ENABLE_AUDIT.
 * @event_f_flags: open(2) style flags applied to event object fds.
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	struct user_struct *user;
	struct fanotify_event *oevent;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* fid reporting is only valid with the plain notification class */
	if ((flags & FAN_REPORT_FID) &&
	    (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
		return -EINVAL;

	user = get_current_user();
	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
		free_uid(user);
		return -EMFILE;
	}

	f_flags = O_RDWR | FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
	if (IS_ERR(group)) {
		free_uid(user);
		return PTR_ERR(group);
	}

	group->fanotify_data.user = user;
	group->fanotify_data.flags = flags;
	atomic_inc(&user->fanotify_listeners);
	/* charge event allocations to the creating task's memcg */
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	/* Preallocate the overflow event so queue overflow can't fail */
	oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, NULL);
	if (unlikely(!oevent)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}
	group->overflow_event = &oevent->fse;

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	switch (flags & FANOTIFY_CLASS_BITS) {
	case FAN_CLASS_NOTIF:
		group->priority = FS_PRIO_0;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FS_PRIO_1;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FS_PRIO_2;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->fanotify_data.max_marks = UINT_MAX;
	} else {
		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
864

865
/* Check if filesystem can encode a unique fid */
static int fanotify_test_fid(struct path *path, struct kstatfs *stat)
{
	struct kstatfs root_stat;
	struct path root = {
		.mnt = path->mnt,
		.dentry = path->dentry->d_sb->s_root,
	};
	int err;

	/*
	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
	 */
	err = vfs_statfs(path, stat);
	if (err)
		return err;

	if (!stat->f_fsid.val[0] && !stat->f_fsid.val[1])
		return -ENODEV;

	/*
	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_statfs(&root, &root_stat);
	if (err)
		return err;

	if (root_stat.f_fsid.val[0] != stat->f_fsid.val[0] ||
	    root_stat.f_fsid.val[1] != stat->f_fsid.val[1])
		return -EXDEV;

	/*
	 * We need to make sure that the file system supports at least
	 * encoding a file handle so user can use name_to_handle_at() to
	 * compare fid returned with event to the file handle of watched
	 * objects. However, name_to_handle_at() requires that the
	 * filesystem also supports decoding file handles.
	 */
	if (!path->dentry->d_sb->s_export_op ||
	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
		return -EOPNOTSUPP;

	return 0;
}

911 912
/*
 * Common implementation for fanotify_mark(2) (native and compat):
 * validate flags/mask, resolve the target path and add, remove or
 * flush marks on the inode, mount or filesystem it identifies.
 */
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	struct kstatfs stat;
	__kernel_fsid_t *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (mask & ((__u64)0xffffffff << 32))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	switch (mark_type) {
	case FAN_MARK_INODE:
	case FAN_MARK_MOUNT:
	case FAN_MARK_FILESYSTEM:
		break;
	default:
		return -EINVAL;
	}

	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
	case FAN_MARK_ADD:		/* fallthrough */
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		/* exactly one of ADD/REMOVE/FLUSH must be set */
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	f = fdget(fanotify_fd);
	if (unlikely(!f.file))
		return -EBADF;

	/* verify that this is indeed an fanotify instance */
	ret = -EINVAL;
	if (unlikely(f.file->f_op != &fanotify_fops))
		goto fput_and_out;
	group = f.file->private_data;

	/*
	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
	 * allowed to set permissions events.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FS_PRIO_0)
		goto fput_and_out;

	/* FLUSH needs no path lookup; it clears all marks of the type */
	if (flags & FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags);
	if (ret)
		goto fput_and_out;

	/* fid mode groups need a filesystem able to encode unique fids */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		ret = fanotify_test_fid(&path, &stat);
		if (ret)
			goto path_put_and_out;

		fsid = &stat.f_fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE)
		inode = path.dentry->d_inode;
	else
		mnt = path.mnt;

	/* create/update an inode mark */
	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
	case FAN_MARK_ADD:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
							 flags, fsid);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
						   flags, fsid);
		else
			ret = fanotify_add_inode_mark(group, inode, mask,
						      flags, fsid);
		break;
	case FAN_MARK_REMOVE:
		if (mark_type == FAN_MARK_MOUNT)
			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
							    flags);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
						      flags);
		else
			ret = fanotify_remove_inode_mark(group, inode, mask,
							 flags);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}

1047 1048 1049 1050 1051 1052 1053
/* fanotify_mark(2): native entry point, thin wrapper */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}

1054 1055 1056 1057 1058 1059
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				__u32, mask0, __u32, mask1, int, dfd,
				const char  __user *, pathname)
{
1060
	return do_fanotify_mark(fanotify_fd, flags,
1061 1062
#ifdef __BIG_ENDIAN
				((__u64)mask0 << 32) | mask1,
H
Heiko Carstens 已提交
1063 1064
#else
				((__u64)mask1 << 32) | mask0,
1065 1066 1067 1068 1069
#endif
				 dfd, pathname);
}
#endif

1070
/*
 * fanotify_user_setup - Our initialization function.  Note that we cannot return
 * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
 * must result in panic().
 */
static int __init fanotify_user_setup(void)
{
	/* Catch flag additions that forget to update these counts */
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);

	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	return 0;
}
1090
device_initcall(fanotify_user_setup);