diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff4ff617675a7b02cdcf2cec084431..22ae06ad751d96277a0095fb308ea5149d39b5a3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
+
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count.  Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption.  However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow.  Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
-	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
 	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe.  AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff5140b9cda4ca104df8d3e2cb4426..ac49c1f8e5c098bda178a752458258d8f4e0436d 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
+#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
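
For readers who want to poke at the ordering protocol outside the kernel, below is a minimal user-space sketch of the same idea, assuming C11 atomics as stand-ins for smp_mb()/smp_rmb()/smp_wmb() and preempt_disable(), and a fixed per-thread slot in place of a real per-CPU counter. The struct fake_mnt, want_write(), drop_write() and make_readonly() names are invented for this illustration; they are not kernel interfaces, and the fences are deliberately conservative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 4			/* stand-in for NR_CPUS */

struct fake_mnt {
	atomic_int writers[NR_SLOTS];	/* stand-in for the per-CPU counters */
	atomic_bool write_hold;		/* stand-in for MNT_WRITE_HOLD */
	atomic_bool readonly;		/* stand-in for MNT_READONLY */
};

/* Fast path: bump this slot's counter, then check for a pending r/o switch. */
static int want_write(struct fake_mnt *m, int slot)
{
	atomic_fetch_add_explicit(&m->writers[slot], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb() */
	while (atomic_load_explicit(&m->write_hold, memory_order_relaxed))
		;					/* ~ cpu_relax() spin */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	if (atomic_load_explicit(&m->readonly, memory_order_relaxed)) {
		atomic_fetch_sub_explicit(&m->writers[slot], 1,
					  memory_order_relaxed);
		return -1;				/* ~ -EROFS */
	}
	return 0;
}

static void drop_write(struct fake_mnt *m, int slot)
{
	atomic_fetch_sub_explicit(&m->writers[slot], 1, memory_order_relaxed);
}

/* Slow path: hold new writers, sum the counters, flip to read-only if zero. */
static int make_readonly(struct fake_mnt *m)
{
	int sum = 0, ret = 0;
	int i;

	atomic_store_explicit(&m->write_hold, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb() */
	for (i = 0; i < NR_SLOTS; i++)
		sum += atomic_load_explicit(&m->writers[i],
					    memory_order_relaxed);
	if (sum > 0)
		ret = -1;				/* ~ -EBUSY */
	else
		atomic_store_explicit(&m->readonly, true,
				      memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&m->write_hold, false, memory_order_relaxed);
	return ret;
}

int main(void)
{
	static struct fake_mnt m;	/* zero-initialized */

	if (want_write(&m, 0) == 0) {
		printf("remount r/o with a writer active: %d\n",
		       make_readonly(&m));
		drop_write(&m, 0);
	}
	printf("remount r/o with no writers: %d\n", make_readonly(&m));
	printf("write attempt after r/o: %d\n", want_write(&m, 0));
	return 0;
}

As in the patch, the fast path only touches its own counter slot and never takes a shared lock; the rare read-only transition sets the hold flag, issues a full barrier, and then walks every slot, so the sum it computes cannot be skewed by an increment/decrement pair racing across CPUs.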