Commit 3d733633 authored by Dave Hansen's avatar Dave Hansen Committed by Al Viro

[PATCH] r/o bind mounts: track numbers of writers to mounts

This is the real meat of the entire series.  It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time.  Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.

This uses a statically-allocated percpu variable.  All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances.  Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two

Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.

I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.

http://sr71.net/~dave/linux/openbench.c

The code in here is a a worst-possible case for this patch.  It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely.  This worst-case scenario causes a 3%
degredation in the benchmark.

I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory.  In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.

(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable.  So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)

[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: default avatarAl Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: default avatarChristoph Hellwig <hch@infradead.org>
Signed-off-by: default avatarDave Hansen <haveblue@us.ibm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent 2c463e95
......@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
......@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
return tmp & (HASH_SIZE - 1);
}
#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
......@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
......@@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
return mnt;
}
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
* We must keep track of when those operations start
* (for permission checks) and when they end, so that
* we can determine when writes are able to occur to
* a filesystem.
*/
/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly ouside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
int __mnt_is_readonly(struct vfsmount *mnt)
{
return (mnt->mnt_sb->s_flags & MS_RDONLY);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
struct mnt_writer {
/*
* If holding multiple instances of this lock, they
* must be ordered by cpu number.
*/
spinlock_t lock;
struct lock_class_key lock_class; /* compiles out with !lockdep */
unsigned long count;
struct vfsmount *mnt;
} ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
static int __init init_mnt_writers(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
spin_lock_init(&writer->lock);
lockdep_set_class(&writer->lock, &writer->lock_class);
writer->count = 0;
}
return 0;
}
fs_initcall(init_mnt_writers);
static void unlock_mnt_writers(void)
{
int cpu;
struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
spin_unlock(&cpu_writer->lock);
}
}
static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
{
if (!cpu_writer->mnt)
return;
/*
* This is in case anyone ever leaves an invalid,
* old ->mnt and a count of 0.
*/
if (!cpu_writer->count)
return;
atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
cpu_writer->count = 0;
}
/*
* must hold cpu_writer->lock
*/
static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
struct vfsmount *mnt)
{
if (cpu_writer->mnt == mnt)
return;
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = mnt;
}
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
......@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
*/
int mnt_want_write(struct vfsmount *mnt)
{
if (__mnt_is_readonly(mnt))
return -EROFS;
return 0;
int ret = 0;
struct mnt_writer *cpu_writer;
cpu_writer = &get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);
if (__mnt_is_readonly(mnt)) {
ret = -EROFS;
goto out;
}
use_cpu_writer_for_mount(cpu_writer, mnt);
cpu_writer->count++;
out:
spin_unlock(&cpu_writer->lock);
put_cpu_var(mnt_writers);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
static void lock_mnt_writers(void)
{
int cpu;
struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
cpu_writer = &per_cpu(mnt_writers, cpu);
spin_lock(&cpu_writer->lock);
__clear_mnt_count(cpu_writer);
cpu_writer->mnt = NULL;
}
}
/*
* These per-cpu write counts are not guaranteed to have
* matched increments and decrements on any given cpu.
* A file open()ed for write on one cpu and close()d on
* another cpu will imbalance this count. Make sure it
* does not get too far out of whack.
*/
static void handle_write_count_underflow(struct vfsmount *mnt)
{
if (atomic_read(&mnt->__mnt_writers) >=
MNT_WRITER_UNDERFLOW_LIMIT)
return;
/*
* It isn't necessary to hold all of the locks
* at the same time, but doing it this way makes
* us share a lot more code.
*/
lock_mnt_writers();
/*
* vfsmount_lock is for mnt_flags.
*/
spin_lock(&vfsmount_lock);
/*
* If coalescing the per-cpu writer counts did not
* get us back to a positive writer count, we have
* a bug.
*/
if ((atomic_read(&mnt->__mnt_writers) < 0) &&
!(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
printk(KERN_DEBUG "leak detected on mount(%p) writers "
"count: %d\n",
mnt, atomic_read(&mnt->__mnt_writers));
WARN_ON(1);
/* use the flag to keep the dmesg spam down */
mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
}
spin_unlock(&vfsmount_lock);
unlock_mnt_writers();
}
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
......@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
*/
void mnt_drop_write(struct vfsmount *mnt)
{
int must_check_underflow = 0;
struct mnt_writer *cpu_writer;
cpu_writer = &get_cpu_var(mnt_writers);
spin_lock(&cpu_writer->lock);
use_cpu_writer_for_mount(cpu_writer, mnt);
if (cpu_writer->count > 0) {
cpu_writer->count--;
} else {
must_check_underflow = 1;
atomic_dec(&mnt->__mnt_writers);
}
spin_unlock(&cpu_writer->lock);
/*
* Logically, we could call this each time,
* but the __mnt_writers cacheline tends to
* be cold, and makes this expensive.
*/
if (must_check_underflow)
handle_write_count_underflow(mnt);
/*
* This could be done right after the spinlock
* is taken because the spinlock keeps us on
* the cpu, and disables preemption. However,
* putting it here bounds the amount that
* __mnt_writers can underflow. Without it,
* we could theoretically wrap __mnt_writers.
*/
put_cpu_var(mnt_writers);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly ouside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
*/
int __mnt_is_readonly(struct vfsmount *mnt)
int mnt_make_readonly(struct vfsmount *mnt)
{
return (mnt->mnt_sb->s_flags & MS_RDONLY);
int ret = 0;
lock_mnt_writers();
/*
* With all the locks held, this value is stable
*/
if (atomic_read(&mnt->__mnt_writers) > 0) {
ret = -EBUSY;
goto out;
}
/*
* actually set mount's r/o flag here to make
* __mnt_is_readonly() true, which keeps anyone
* from doing a successful mnt_want_write().
*/
out:
unlock_mnt_writers();
return ret;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
......@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
static inline void __mntput(struct vfsmount *mnt)
{
int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
* We don't have to hold all of the locks at the
* same time here because we know that we're the
* last reference to mnt and that no new writers
* can come in.
*/
for_each_possible_cpu(cpu) {
struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
if (cpu_writer->mnt != mnt)
continue;
spin_lock(&cpu_writer->lock);
atomic_add(cpu_writer->count, &mnt->__mnt_writers);
cpu_writer->count = 0;
/*
* Might as well do this so that no one
* ever sees the pointer and expects
* it to be valid.
*/
cpu_writer->mnt = NULL;
spin_unlock(&cpu_writer->lock);
}
/*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
WARN_ON(atomic_read(&mnt->__mnt_writers));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
......
......@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
......@@ -30,6 +31,7 @@ struct mnt_namespace;
#define MNT_RELATIME 0x20
#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
......@@ -62,6 +64,11 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
/*
* This value is not stable unless all of the mnt_writers[] spinlocks
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
*/
atomic_t __mnt_writers;
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment