/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * fsnotify inode mark locking/lifetime/and refcnting
 *
 * REFCNT:
 * The mark->refcnt tells how many "things" in the kernel currently are
 * referencing this object.  The object typically will live inside the kernel
 * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
 * which can find this object, holding the appropriate locks, can take a reference
 * and the object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
 * be taken in order as follows:
 *
 * mark->lock
 * group->mark_lock
 * inode->i_lock
 *
 * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
 * that lock to dereference either of these things (they could be NULL even with
 * the lock)
 *
 * group->mark_lock protects the marks_list anchored inside a given group
 * and each mark is hooked via the g_list.  It also sorta protects the
 * free_g_list, which when used is anchored by a private list on the stack of the
 * task which held the group->mark_lock.
 *
 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
 * given inode and each mark is hooked via the i_list. (and sorta the
 * free_i_list)
 *
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * Worst case we are given an inode and need to clean up all the marks on that
 * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
 * mark on the list we take a reference (so the mark can't disappear under us).
 * We remove that mark from the inode's list of marks and we add this mark to a
 * private list anchored on the stack using free_i_list;  At this point we no
 * longer fear anything finding the mark using the inode's list of marks.
 *
 * We can safely and locklessly run the private list on the stack of everything
 * we just detached from the original inode.  For each mark on the private list
 * we grab the mark->lock and can thus dereference mark->group and mark->inode.
 * If we see the group and inode are not NULL we take those locks.  Now holding
 * all 3 locks we can completely remove the mark from other tasks finding it in
 * the future.  Remember, 10 things might already be referencing this mark, but
 * they had better be holding a ref.  We drop the reference we took before we
 * unhooked it from the inode.  When the ref hits 0 we can free the mark.
 *
 * Very similarly for freeing by group, except we use free_g_list.
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
 */
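
/*
 * Example (illustrative sketch only, not an API contract): a backend wanting
 * to watch an inode would typically pin and attach a mark roughly as follows,
 * where group, inode and the FS_MODIFY mask are placeholders supplied by the
 * backend:
 *
 *	static void example_free_mark(struct fsnotify_mark *mark)
 *	{
 *		kfree(mark);
 *	}
 *
 *	mark = kzalloc(sizeof(*mark), GFP_KERNEL);
 *	fsnotify_init_mark(mark, example_free_mark);
 *	mark->mask = FS_MODIFY;
 *	ret = fsnotify_add_mark(mark, group, inode, 0);
 *	...
 *	fsnotify_destroy_mark(mark);	(unhooks it from the inode and group)
 *	fsnotify_put_mark(mark);	(drops the initial reference)
 */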

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/writeback.h> /* for inode_lock */

#include <asm/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

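/*
 * Take a reference on a mark.  The caller must already be able to see the
 * mark and keep it alive, i.e. hold a reference or one of the locks
 * protecting the lists the mark is on.
 */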
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
	atomic_inc(&mark->refcnt);
}

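/*
 * Drop a reference on a mark.  On the final put the mark is handed back to
 * its owner via the free_mark() callback set in fsnotify_init_mark().
 */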
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
	if (atomic_dec_and_test(&mark->refcnt))
		mark->free_mark(mark);
}

/*
 * Recalculate the mask of events relevant to a given inode.  The caller
 * must hold inode->i_lock.
 */
static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
{
	struct fsnotify_mark *mark;
	struct hlist_node *pos;
	__u32 new_mask = 0;

	assert_spin_locked(&inode->i_lock);

	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
		new_mask |= mark->mask;
	inode->i_fsnotify_mask = new_mask;
}

/*
 * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
 * any notifier is interested in hearing for this inode.
 */
void fsnotify_recalc_inode_mask(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	fsnotify_recalc_inode_mask_locked(inode);
	spin_unlock(&inode->i_lock);

	__fsnotify_update_child_dentry_flags(inode);
}
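
/*
 * For example: if one group's mark on an inode has mask FS_MODIFY and a
 * second group's mark on the same inode has mask FS_ATTRIB, the
 * recalculation above leaves
 * inode->i_fsnotify_mask == (FS_MODIFY | FS_ATTRIB), so either event type
 * is worth examining for this inode.
 */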

/*
 * Any time a mark is getting freed we end up here.
 * The caller had better be holding a reference to this mark so we don't actually
 * do the final put under the mark->lock.
 */
void fsnotify_destroy_mark(struct fsnotify_mark *mark)
{
	struct fsnotify_group *group;
	struct inode *inode;

	spin_lock(&mark->lock);

	group = mark->group;
	inode = mark->i.inode;

	BUG_ON(group && !inode);
	BUG_ON(!group && inode);

	/* if !group something else already marked this to die */
	if (!group) {
		spin_unlock(&mark->lock);
		return;
	}

	/* 1 from caller and 1 for being on i_list/g_list */
	BUG_ON(atomic_read(&mark->refcnt) < 2);

	spin_lock(&group->mark_lock);
	spin_lock(&inode->i_lock);

	hlist_del_init(&mark->i.i_list);
	mark->i.inode = NULL;

	list_del_init(&mark->g_list);
	mark->group = NULL;

	fsnotify_put_mark(mark); /* for i_list and g_list */

	/*
	 * this mark is now off the inode->i_fsnotify_marks list and we
	 * hold the inode->i_lock, so this is the perfect time to update the
	 * inode->i_fsnotify_mask
	 */
	fsnotify_recalc_inode_mask_locked(inode);

	spin_unlock(&inode->i_lock);
	spin_unlock(&group->mark_lock);
	spin_unlock(&mark->lock);

	/*
	 * Some groups like to know that marks are being freed.  This is a
	 * callback to the group function to let it know that this mark
	 * is being freed.
	 */
	if (group->ops->freeing_mark)
		group->ops->freeing_mark(mark, group);

	/*
	 * __fsnotify_update_child_dentry_flags(inode);
	 *
	 * I really want to call that, but we can't, we have no idea if the inode
	 * still exists the second we drop the mark->lock.
	 *
	 * The next time an event arrives at this inode from one of its children
	 * __fsnotify_parent will see that the inode doesn't care about its
	 * children and will update all of these flags then.  So really this
	 * is just a lazy update (and could be a perf win...)
	 */


	iput(inode);

	/*
	 * it's possible that this group tried to destroy itself, but this
	 * mark was simultaneously being freed by the inode.  If that's the
	 * case, we finish freeing the group here.
	 */
	if (unlikely(atomic_dec_and_test(&group->num_marks)))
		fsnotify_final_destroy_group(group);
}

/*
 * Given a group, destroy all of the marks associated with that group.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
{
	struct fsnotify_mark *lmark, *mark;
	LIST_HEAD(free_list);

	spin_lock(&group->mark_lock);
	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
		list_add(&mark->free_g_list, &free_list);
		list_del_init(&mark->g_list);
		fsnotify_get_mark(mark);
	}
	spin_unlock(&group->mark_lock);

	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
		fsnotify_destroy_mark(mark);
		fsnotify_put_mark(mark);
	}
}

/*
 * Given an inode, destroy all of the marks associated with that inode.
 */
void fsnotify_clear_marks_by_inode(struct inode *inode)
{
	struct fsnotify_mark *mark, *lmark;
	struct hlist_node *pos, *n;
	LIST_HEAD(free_list);

	spin_lock(&inode->i_lock);
	hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
		list_add(&mark->i.free_i_list, &free_list);
		hlist_del_init(&mark->i.i_list);
		fsnotify_get_mark(mark);
	}
	spin_unlock(&inode->i_lock);

	list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
		fsnotify_destroy_mark(mark);
		fsnotify_put_mark(mark);
	}
}

/*
 * given a group and inode, find the mark associated with that combination.
 * if found take a reference to that mark and return it, else return NULL.
 * the caller must hold inode->i_lock.
 */
struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group,
					 struct inode *inode)
{
	struct fsnotify_mark *mark;
	struct hlist_node *pos;

	assert_spin_locked(&inode->i_lock);

	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
		if (mark->group == group) {
			fsnotify_get_mark(mark);
			return mark;
		}
	}
	return NULL;
}

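/*
 * Copy the watched inode, owning group, event mask and free_mark callback
 * from an existing mark into a new one.  old->lock must be held so these
 * fields cannot change underneath us.
 */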
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
{
	assert_spin_locked(&old->lock);
	new->i.inode = old->i.inode;
	new->group = old->group;
	new->mask = old->mask;
	new->free_mark = old->free_mark;
}

/*
 * Nothing fancy, just initialize lists and locks and counters.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
			void (*free_mark)(struct fsnotify_mark *mark))
{
	spin_lock_init(&mark->lock);
	atomic_set(&mark->refcnt, 1);
	INIT_HLIST_NODE(&mark->i.i_list);
	mark->group = NULL;
	mark->mask = 0;
	mark->i.inode = NULL;
	mark->free_mark = free_mark;
}

/*
 * Attach an initialized mark to a given group and inode.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group and for which inodes.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark,
		      struct fsnotify_group *group, struct inode *inode,
		      int allow_dups)
{
	struct fsnotify_mark *lmark = NULL;
	int ret = 0;

	inode = igrab(inode);
	if (unlikely(!inode))
		return -EINVAL;

	mark->flags = FSNOTIFY_MARK_FLAG_INODE;

	/*
	 * if this group isn't being tested for inode type events we need
	 * to start testing
	 */
	if (unlikely(list_empty(&group->inode_group_list)))
		fsnotify_add_inode_group(group);
	/*
	 * XXX This is where we could also do the fsnotify_add_vfsmount_group
	 * if we are setting a vfsmount mark....

	if (unlikely(list_empty(&group->vfsmount_group_list)))
		fsnotify_add_vfsmount_group(group);
	 */

	/*
	 * LOCKING ORDER!!!!
	 * mark->lock
	 * group->mark_lock
	 * inode->i_lock
	 */
	spin_lock(&mark->lock);
	spin_lock(&group->mark_lock);
	spin_lock(&inode->i_lock);

	if (!allow_dups)
		lmark = fsnotify_find_mark(group, inode);
	if (!lmark) {
		mark->group = group;
		mark->i.inode = inode;

		hlist_add_head(&mark->i.i_list, &inode->i_fsnotify_marks);
		list_add(&mark->g_list, &group->marks_list);

		fsnotify_get_mark(mark); /* for i_list and g_list */

		atomic_inc(&group->num_marks);

		fsnotify_recalc_inode_mask_locked(inode);
	}

	spin_unlock(&inode->i_lock);
	spin_unlock(&group->mark_lock);
	spin_unlock(&mark->lock);

	if (lmark) {
		ret = -EEXIST;
		iput(inode);
		fsnotify_put_mark(lmark);
	} else {
		__fsnotify_update_child_dentry_flags(inode);
	}
	}

	return ret;
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @list: list of inodes being unmounted (sb->s_inodes)
 *
 * Called with inode_lock held, protecting the unmounting super block's list
 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
 * We temporarily drop inode_lock, however, and CAN block.
 */
void fsnotify_unmount_inodes(struct list_head *list)
{
	struct inode *inode, *next_i, *need_iput = NULL;

	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
		struct inode *need_iput_tmp;

		/*
		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
		 * I_WILL_FREE, or I_NEW which is fine because by that point
		 * the inode cannot have any associated watches.
		 */
		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
			continue;

		/*
		 * If i_count is zero, the inode cannot have any watches and
		 * doing an __iget/iput with MS_ACTIVE clear would actually
		 * evict all inodes with zero i_count from icache which is
		 * unnecessarily violent and may in fact be illegal to do.
		 */
		if (!atomic_read(&inode->i_count))
			continue;

		need_iput_tmp = need_iput;
		need_iput = NULL;

		/* In case fsnotify_inode_delete() drops a reference. */
		if (inode != need_iput_tmp)
			__iget(inode);
		else
			need_iput_tmp = NULL;

		/* In case the dropping of a reference would nuke next_i. */
		if ((&next_i->i_sb_list != list) &&
		    atomic_read(&next_i->i_count) &&
		    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
			__iget(next_i);
			need_iput = next_i;
		}

		/*
		 * We can safely drop inode_lock here because we hold
		 * references on both inode and next_i.  Also no new inodes
		 * will be added since the umount has begun.  Finally,
		 * iprune_mutex keeps shrink_icache_memory() away.
		 */
		spin_unlock(&inode_lock);

		if (need_iput_tmp)
			iput(need_iput_tmp);

		/* for each watch, send FS_UNMOUNT and then remove it */
		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);

		fsnotify_inode_delete(inode);

		iput(inode);

		spin_lock(&inode_lock);
	}
}