From 1406b916f4a29d5f9660264a28ce609c8c77e7ae Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Wed, 19 Feb 2014 00:27:53 +0900 Subject: nfsd: fix lost nfserrno() call in nfsd_setattr() There is a regression in 208d0ac 2014-01-07 nfsd4: break only delegations when appropriate which deletes an nfserrno() call in nfsd_setattr() (by accident, probably), and NFSD becomes ignoring an error from VFS. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 017d3cb5e99b..6d7be3f80356 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -449,6 +449,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, fh_lock(fhp); host_err = notify_change(dentry, iap, NULL); fh_unlock(fhp); + err = nfserrno(host_err); out_put_write_access: if (size_change) -- cgit v1.2.3 From d7a15f8d0777955986a2ab00ab181795cab14b01 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 16 Mar 2014 14:24:08 -0500 Subject: vfs: atomic f_pos access in llseek() Commit 9c225f2655e36a4 ("vfs: atomic f_pos accesses as per POSIX") changed several system calls to use fdget_pos() instead of fdget(), but missed sys_llseek(). Fix it. Signed-off-by: Eric Biggers Signed-off-by: Al Viro --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 54e19b9392dc..28cc9c810744 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -307,7 +307,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned int, whence) { int retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) @@ -327,7 +327,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, retval = 0; } out_putf: - fdput(f); + fdput_pos(f); return retval; } #endif -- cgit v1.2.3 From 99aea68134f3c2a27b4d463c91cfa298c3efaccf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 16 Mar 2014 15:47:48 -0500 Subject: vfs: Don't let __fdget_pos() get FMODE_PATH files Commit bd2a31d522344 ("get rid of fget_light()") introduced the __fdget_pos() function, which returns the resulting file pointer and fdput flags combined in an 'unsigned long'. However, it also changed the behavior to return files with FMODE_PATH set, which shouldn't happen because read(), write(), lseek(), etc. aren't allowed on such files. This commit restores the old behavior. This regression actually had no effect on read() and write() since FMODE_READ and FMODE_WRITE are not set on file descriptors opened with O_PATH, but it did cause lseek() on a file descriptor opened with O_PATH to fail with ESPIPE rather than EBADF. Signed-off-by: Eric Biggers Signed-off-by: Al Viro --- fs/file.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 60a45e9f5323..eb56a13dab3e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -713,27 +713,16 @@ unsigned long __fdget_raw(unsigned int fd) unsigned long __fdget_pos(unsigned int fd) { - struct files_struct *files = current->files; - struct file *file; - unsigned long v; - - if (atomic_read(&files->count) == 1) { - file = __fcheck_files(files, fd); - v = 0; - } else { - file = __fget(fd, 0); - v = FDPUT_FPUT; - } - if (!file) - return 0; + unsigned long v = __fdget(fd); + struct file *file = (struct file *)(v & ~3); - if (file->f_mode & FMODE_ATOMIC_POS) { + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { if (file_count(file) > 1) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } } - return v | (unsigned long)file; + return v; } /* -- cgit v1.2.3 From e825196d48d2b89a6ec3a8eff280098d2a78207e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Mar 2014 00:28:40 -0400 Subject: make prepend_name() work correctly when called with negative *buflen In all callchains leading to prepend_name(), the value left in *buflen is eventually discarded unused if prepend_name() has returned a negative. So we are free to do what prepend() does, and subtract from *buflen *before* checking for underflow (which turns into checking the sign of subtraction result, of course). Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 265e0ce9769c..ca02c13a84aa 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2833,9 +2833,9 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) u32 dlen = ACCESS_ONCE(name->len); char *p; - if (*buflen < dlen + 1) - return -ENAMETOOLONG; *buflen -= dlen + 1; + if (*buflen < 0) + return -ENAMETOOLONG; p = *buffer -= dlen + 1; *p++ = '/'; while (dlen--) { -- cgit v1.2.3 From b37199e626b31e1175fb06764c5d1d687723aac2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Mar 2014 15:18:22 -0400 Subject: rcuwalk: recheck mount_lock after mountpoint crossing attempts We can get false negative from __lookup_mnt() if an unrelated vfsmount gets moved. In that case legitimize_mnt() is guaranteed to fail, and we will fall back to non-RCU walk... unless we end up running into a hard error on a filesystem object we wouldn't have reached if not for that false negative. IOW, delaying that check until the end of pathname resolution is wrong - we should recheck right after we attempt to cross the mountpoint. We don't need to recheck unless we see d_mountpoint() being true - in that case even if we have just raced with mount/umount, we can simply go on as if we'd come at the moment when the sucker wasn't a mountpoint; if we run into a hard error as the result, it was a legitimate outcome. __lookup_mnt() returning NULL is different in that respect, since it might've happened due to operation on completely unrelated mountpoint. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 2f730ef9b4b3..4b491b431990 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1109,7 +1109,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, return false; if (!d_mountpoint(path->dentry)) - break; + return true; mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) @@ -1125,20 +1125,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return true; -} - -static void follow_mount_rcu(struct nameidata *nd) -{ - while (d_mountpoint(nd->path.dentry)) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); - if (!mounted) - break; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } + return read_seqretry(&mount_lock, nd->m_seq); } static int follow_dotdot_rcu(struct nameidata *nd) @@ -1166,7 +1153,17 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - follow_mount_rcu(nd); + while (d_mountpoint(nd->path.dentry)) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (!mounted) + break; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + if (!read_seqretry(&mount_lock, nd->m_seq)) + goto failed; + } nd->inode = nd->path.dentry->d_inode; return 0; -- cgit v1.2.3 From d6f2589ad561aa5fa39f347eca6942668b7560a1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Mar 2014 21:37:09 +0100 Subject: fs: Avoid userspace mounting anon_inodefs filesystem anon_inodefs filesystem is a kernel internal filesystem userspace shouldn't mess with. Remove registration of it so userspace cannot even try to mount it (which would fail anyway because the filesystem is MS_NOUSER). This fixes an oops triggered by trinity when it tried mounting anon_inodefs which overwrote anon_inode_inode pointer while other CPU has been in anon_inode_getfile() between ihold() and d_instantiate(). Thus effectively creating dentry pointing to an inode without holding a reference to it. Reported-by: Sasha Levin Signed-off-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24084732b1d0..4b4543b8b894 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -177,9 +177,6 @@ static int __init anon_inode_init(void) { int error; - error = register_filesystem(&anon_inode_fs_type); - if (error) - goto err_exit; anon_inode_mnt = kern_mount(&anon_inode_fs_type); if (IS_ERR(anon_inode_mnt)) { error = PTR_ERR(anon_inode_mnt); -- cgit v1.2.3 From fce7fc79c8f7188dfc5eafa1b937bcc3c5a4c2f5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Mar 2014 17:43:34 -0700 Subject: fs: remove now stale label in anon_inode_init() The previous commit removed the register_filesystem() call and the associated error handling, but left the label for the error path that no longer exists. Remove that too. Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 4b4543b8b894..42fcc46e2cca 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -186,7 +186,6 @@ static int __init anon_inode_init(void) err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); -err_exit: panic(KERN_ERR "anon_inode_init() failed (%d)\n", error); } -- cgit v1.2.3 From 75c5a52da3fc2a06abb6c6192bdf5d680e56d37d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 26 Mar 2014 06:20:14 +0100 Subject: vfs: Allocate anon_inode_inode in anon_inode_init() Currently we allocated anon_inode_inode in anon_inodefs_mount. This is somewhat fragile as if that function ever gets called again, it will overwrite anon_inode_inode pointer. So move the initialization of anon_inode_inode to anon_inode_init(). Signed-off-by: Jan Kara [ Further simplified on suggestion from Dave Jones ] Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 42fcc46e2cca..80ef38c73e5a 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -41,19 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct dentry *root; - root = mount_pseudo(fs_type, "anon_inode:", NULL, + return mount_pseudo(fs_type, "anon_inode:", NULL, &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); - if (!IS_ERR(root)) { - struct super_block *s = root->d_sb; - anon_inode_inode = alloc_anon_inode(s); - if (IS_ERR(anon_inode_inode)) { - dput(root); - deactivate_locked_super(s); - root = ERR_CAST(anon_inode_inode); - } - } - return root; } static struct file_system_type anon_inode_fs_type = { @@ -175,18 +164,15 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); static int __init anon_inode_init(void) { - int error; - anon_inode_mnt = kern_mount(&anon_inode_fs_type); - if (IS_ERR(anon_inode_mnt)) { - error = PTR_ERR(anon_inode_mnt); - goto err_unregister_filesystem; - } - return 0; + if (IS_ERR(anon_inode_mnt)) + panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); -err_unregister_filesystem: - unregister_filesystem(&anon_inode_fs_type); - panic(KERN_ERR "anon_inode_init() failed (%d)\n", error); + anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + if (IS_ERR(anon_inode_inode)) + panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); + + return 0; } fs_initcall(anon_inode_init); -- cgit v1.2.3 From d9060742fbf630fe31951dfc10b798deb2813f01 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 28 Mar 2014 13:33:38 -0700 Subject: ocfs2: check if cluster name exists before deref Commit c74a3bdd9b52 ("ocfs2: add clustername to cluster connection") is trying to strlcpy a string which was explicitly passed as NULL in the very same patch, triggering a NULL ptr deref. BUG: unable to handle kernel NULL pointer dereference at (null) IP: strlcpy (lib/string.c:388 lib/string.c:151) CPU: 19 PID: 19426 Comm: trinity-c19 Tainted: G W 3.14.0-rc7-next-20140325-sasha-00014-g9476368-dirty #274 RIP: strlcpy (lib/string.c:388 lib/string.c:151) Call Trace: ocfs2_cluster_connect (fs/ocfs2/stackglue.c:350) ocfs2_cluster_connect_agnostic (fs/ocfs2/stackglue.c:396) user_dlm_register (fs/ocfs2/dlmfs/userdlm.c:679) dlmfs_mkdir (fs/ocfs2/dlmfs/dlmfs.c:503) vfs_mkdir (fs/namei.c:3467) SyS_mkdirat (fs/namei.c:3488 fs/namei.c:3472) tracesys (arch/x86/kernel/entry_64.S:749) akpm: this patch probably disables the feature. A temporary thing to avoid triviel oopses. Signed-off-by: Sasha Levin Cc: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 1324e6600e57..ca5ce14cbddc 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -346,7 +346,9 @@ int ocfs2_cluster_connect(const char *stack_name, strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; - strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); + if (cluster_name_len) + strlcpy(new_conn->cc_cluster_name, cluster_name, + CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; -- cgit v1.2.3 From 0818bf27c05b2de56c5b2bd08cfae2a939bd5f52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Feb 2014 13:46:44 -0500 Subject: resizable namespace.c hashes * switch allocation to alloc_large_system_hash() * make sizes overridable by boot parameters (mhash_entries=, mphash_entries=) * switch mountpoint_hashtable from list_head to hlist_head Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/mount.h | 2 +- fs/namespace.c | 81 +++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index a17458ca6f29..acdb428de393 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -19,7 +19,7 @@ struct mnt_pcp { }; struct mountpoint { - struct list_head m_hash; + struct hlist_node m_hash; struct dentry *m_dentry; int m_count; }; diff --git a/fs/namespace.c b/fs/namespace.c index 22e536705c45..3b648da55d87 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,11 +23,34 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" -#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) -#define HASH_SIZE (1UL << HASH_SHIFT) +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; + +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); static int event; static DEFINE_IDA(mnt_id_ida); @@ -37,7 +60,7 @@ static int mnt_id_start = 0; static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; -static struct list_head *mountpoint_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static DECLARE_RWSEM(namespace_sem); @@ -55,12 +78,19 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +static inline struct list_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); - tmp = tmp + (tmp >> HASH_SHIFT); - return tmp & (HASH_SIZE - 1); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; +} + +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; } /* @@ -575,7 +605,7 @@ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = mount_hashtable + hash(mnt, dentry); + struct list_head *head = m_hash(mnt, dentry); struct mount *p; list_for_each_entry_rcu(p, head, mnt_hash) @@ -590,7 +620,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = mount_hashtable + hash(mnt, dentry); + struct list_head *head = m_hash(mnt, dentry); struct mount *p; list_for_each_entry_reverse(p, head, mnt_hash) @@ -633,11 +663,11 @@ struct vfsmount *lookup_mnt(struct path *path) static struct mountpoint *new_mountpoint(struct dentry *dentry) { - struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); + struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; int ret; - list_for_each_entry(mp, chain, m_hash) { + hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) @@ -659,7 +689,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) mp->m_dentry = dentry; mp->m_count = 1; - list_add(&mp->m_hash, chain); + hlist_add_head(&mp->m_hash, chain); return mp; } @@ -670,7 +700,7 @@ static void put_mountpoint(struct mountpoint *mp) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); - list_del(&mp->m_hash); + hlist_del(&mp->m_hash); kfree(mp); } } @@ -739,8 +769,7 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } @@ -762,8 +791,8 @@ static void commit_tree(struct mount *mnt) list_splice(&head, n->list.prev); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } @@ -2777,18 +2806,24 @@ void __init mnt_init(void) mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct list_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + 0, + &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); - - for (u = 0; u < HASH_SIZE; u++) + for (u = 0; u <= m_hash_mask; u++) INIT_LIST_HEAD(&mount_hashtable[u]); - for (u = 0; u < HASH_SIZE; u++) - INIT_LIST_HEAD(&mountpoint_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); kernfs_init(); -- cgit v1.2.3 From 1d6a32acd70ab18499829c0a9a5dbe2bace72a13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Mar 2014 20:34:43 -0400 Subject: keep shadowed vfsmounts together preparation to switching mnt_hash to hlist Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namespace.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 3b648da55d87..9db3ce397a83 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -621,12 +621,20 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { struct list_head *head = m_hash(mnt, dentry); - struct mount *p; + struct mount *p, *res = NULL; - list_for_each_entry_reverse(p, head, mnt_hash) + list_for_each_entry(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) - return p; - return NULL; + goto found; + return res; +found: + res = p; + list_for_each_entry_continue(p, head, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) + break; + res = p; + } + return res; } /* @@ -769,14 +777,14 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - list_add_tail(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt) +static void commit_tree(struct mount *mnt, struct mount *shadows) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -791,7 +799,10 @@ static void commit_tree(struct mount *mnt) list_splice(&head, n->list.prev); - list_add_tail(&mnt->mnt_hash, + if (shadows) + list_add(&mnt->mnt_hash, &shadows->mnt_hash); + else + list_add(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); @@ -1659,12 +1670,15 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt); + commit_tree(source_mnt, NULL); } list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { + struct mount *q; list_del_init(&child->mnt_hash); - commit_tree(child); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); } unlock_mount_hash(); -- cgit v1.2.3 From 0b1b901b5a98bb36943d10820efc796f7cd45ff3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Mar 2014 10:14:08 -0400 Subject: don't bother with propagate_mnt() unless the target is shared If the dest_mnt is not shared, propagate_mnt() does nothing - there's no mounts to propagate to and thus no copies to create. Might as well don't bother calling it in that case. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namespace.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 9db3ce397a83..d3fb9f00576e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1653,16 +1653,14 @@ static int attach_recursive_mnt(struct mount *source_mnt, err = invent_group_ids(source_mnt, true); if (err) goto out; - } - err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - if (err) - goto out_cleanup_ids; - - lock_mount_hash(); - - if (IS_MNT_SHARED(dest_mnt)) { + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + if (err) + goto out_cleanup_ids; + lock_mount_hash(); for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); + } else { + lock_mount_hash(); } if (parent_path) { detach_mnt(source_mnt, parent_path); @@ -1685,8 +1683,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, return 0; out_cleanup_ids: - if (IS_MNT_SHARED(dest_mnt)) - cleanup_group_ids(source_mnt, NULL); + cleanup_group_ids(source_mnt, NULL); out: return err; } -- cgit v1.2.3 From 38129a13e6e71f666e0468e99fdd932a687b4d7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Mar 2014 21:10:51 -0400 Subject: switch mnt_hash to hlist fixes RCU bug - walking through hlist is safe in face of element moves, since it's self-terminating. Cyclic lists are not - if we end up jumping to another hash chain, we'll loop infinitely without ever hitting the original list head. [fix for dumb braino folded] Spotted by: Max Kellermann Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/mount.h | 2 +- fs/namespace.c | 79 ++++++++++++++++++++++++++++++++-------------------------- fs/pnode.c | 26 +++++++++++-------- fs/pnode.h | 4 +-- 4 files changed, 61 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index acdb428de393..b29e42f05f34 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -25,7 +25,7 @@ struct mountpoint { }; struct mount { - struct list_head mnt_hash; + struct hlist_node mnt_hash; struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; diff --git a/fs/namespace.c b/fs/namespace.c index d3fb9f00576e..2ffc5a2905d4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(mnt_id_lock); static int mnt_id_start = 0; static int mnt_group_start = 1; -static struct list_head *mount_hashtable __read_mostly; +static struct hlist_head *mount_hashtable __read_mostly; static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static DECLARE_RWSEM(namespace_sem); @@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct list_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); @@ -217,7 +217,7 @@ static struct mount *alloc_vfsmnt(const char *name) mnt->mnt_writers = 0; #endif - INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -605,10 +605,10 @@ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = m_hash(mnt, dentry); + struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; - list_for_each_entry_rcu(p, head, mnt_hash) + hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; @@ -620,20 +620,16 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = m_hash(mnt, dentry); - struct mount *p, *res = NULL; - - list_for_each_entry(p, head, mnt_hash) - if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) - goto found; - return res; -found: - res = p; - list_for_each_entry_continue(p, head, mnt_hash) { + struct mount *p, *res; + res = p = __lookup_mnt(mnt, dentry); + if (!p) + goto out; + hlist_for_each_entry_continue(p, mnt_hash) { if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; res = p; } +out: return res; } @@ -750,7 +746,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); + hlist_del_init_rcu(&mnt->mnt_hash); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@ -777,7 +773,7 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - list_add(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } @@ -800,9 +796,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); if (shadows) - list_add(&mnt->mnt_hash, &shadows->mnt_hash); + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); else - list_add(&mnt->mnt_hash, + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); @@ -1193,26 +1189,28 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -static LIST_HEAD(unmounted); /* protected by namespace_sem */ +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { struct mount *mnt; - LIST_HEAD(head); + struct hlist_head head = unmounted; - if (likely(list_empty(&unmounted))) { + if (likely(hlist_empty(&head))) { up_write(&namespace_sem); return; } - list_splice_init(&unmounted, &head); + head.first->pprev = &head.first; + INIT_HLIST_HEAD(&unmounted); + up_write(&namespace_sem); synchronize_rcu(); - while (!list_empty(&head)) { - mnt = list_first_entry(&head, struct mount, mnt_hash); - list_del_init(&mnt->mnt_hash); + while (!hlist_empty(&head)) { + mnt = hlist_entry(head.first, struct mount, mnt_hash); + hlist_del_init(&mnt->mnt_hash); if (mnt->mnt_ex_mountpoint.mnt) path_put(&mnt->mnt_ex_mountpoint); mntput(&mnt->mnt); @@ -1233,16 +1231,19 @@ static inline void namespace_lock(void) */ void umount_tree(struct mount *mnt, int how) { - LIST_HEAD(tmp_list); + HLIST_HEAD(tmp_list); struct mount *p; + struct mount *last = NULL; - for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, &tmp_list); + for (p = mnt; p; p = next_mnt(p, mnt)) { + hlist_del_init_rcu(&p->mnt_hash); + hlist_add_head(&p->mnt_hash, &tmp_list); + } if (how) propagate_umount(&tmp_list); - list_for_each_entry(p, &tmp_list, mnt_hash) { + hlist_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); @@ -1260,8 +1261,13 @@ void umount_tree(struct mount *mnt, int how) p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); + last = p; + } + if (last) { + last->mnt_hash.next = unmounted.first; + unmounted.first = tmp_list.first; + unmounted.first->pprev = &unmounted.first; } - list_splice(&tmp_list, &unmounted); } static void shrink_submounts(struct mount *mnt); @@ -1645,8 +1651,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { - LIST_HEAD(tree_list); + HLIST_HEAD(tree_list); struct mount *child, *p; + struct hlist_node *n; int err; if (IS_MNT_SHARED(dest_mnt)) { @@ -1671,9 +1678,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, commit_tree(source_mnt, NULL); } - list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; - list_del_init(&child->mnt_hash); + hlist_del_init(&child->mnt_hash); q = __lookup_mnt_last(&child->mnt_parent->mnt, child->mnt_mountpoint); commit_tree(child, q); @@ -2818,7 +2825,7 @@ void __init mnt_init(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", - sizeof(struct list_head), + sizeof(struct hlist_head), mhash_entries, 19, 0, &m_hash_shift, &m_hash_mask, 0, 0); @@ -2832,7 +2839,7 @@ void __init mnt_init(void) panic("Failed to allocate mount hash table\n"); for (u = 0; u <= m_hash_mask; u++) - INIT_LIST_HEAD(&mount_hashtable[u]); + INIT_HLIST_HEAD(&mount_hashtable[u]); for (u = 0; u <= mp_hash_mask; u++) INIT_HLIST_HEAD(&mountpoint_hashtable[u]); diff --git a/fs/pnode.c b/fs/pnode.c index c7221bb19801..88396df725b4 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -220,14 +220,14 @@ static struct mount *get_source(struct mount *dest, * @tree_list : list of heads of trees to be attached. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, - struct mount *source_mnt, struct list_head *tree_list) + struct mount *source_mnt, struct hlist_head *tree_list) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; struct mount *prev_src_mnt = source_mnt; - LIST_HEAD(tmp_list); + HLIST_HEAD(tmp_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { @@ -246,27 +246,29 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); - list_splice(tree_list, tmp_list.prev); + tmp_list = *tree_list; + tmp_list.first->pprev = &tmp_list.first; + INIT_HLIST_HEAD(tree_list); goto out; } if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { mnt_set_mountpoint(m, dest_mp, child); - list_add_tail(&child->mnt_hash, tree_list); + hlist_add_head(&child->mnt_hash, tree_list); } else { /* * This can happen if the parent mount was bind mounted * on some subdirectory of a shared/slave mount. */ - list_add_tail(&child->mnt_hash, &tmp_list); + hlist_add_head(&child->mnt_hash, &tmp_list); } prev_dest_mnt = m; prev_src_mnt = child; } out: lock_mount_hash(); - while (!list_empty(&tmp_list)) { - child = list_first_entry(&tmp_list, struct mount, mnt_hash); + while (!hlist_empty(&tmp_list)) { + child = hlist_entry(tmp_list.first, struct mount, mnt_hash); umount_tree(child, 0); } unlock_mount_hash(); @@ -338,8 +340,10 @@ static void __propagate_umount(struct mount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) - list_move_tail(&child->mnt_hash, &mnt->mnt_hash); + if (child && list_empty(&child->mnt_mounts)) { + hlist_del_init_rcu(&child->mnt_hash); + hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); + } } } @@ -350,11 +354,11 @@ static void __propagate_umount(struct mount *mnt) * * vfsmount lock must be held for write */ -int propagate_umount(struct list_head *list) +int propagate_umount(struct hlist_head *list) { struct mount *mnt; - list_for_each_entry(mnt, list, mnt_hash) + hlist_for_each_entry(mnt, list, mnt_hash) __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 59e7eda1851e..fc28a27fa892 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -36,8 +36,8 @@ static inline void set_mnt_shared(struct mount *mnt) void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, - struct list_head *); -int propagate_umount(struct list_head *); + struct hlist_head *); +int propagate_umount(struct hlist_head *); int propagate_mount_busy(struct mount *, int); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); -- cgit v1.2.3 From 00a1a053ebe5febcfc2ec498bd894f035ad2aa06 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Mar 2014 10:20:01 -0400 Subject: ext4: atomically set inode->i_flags in ext4_set_inode_flags() Use cmpxchg() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the EXT4_IMMUTABLE_FL, EXT4_APPEND_FL flags, since this opens up a race where an immutable file has the immutable flag cleared for a brief window of time. Reported-by: John Sullivan Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e39895a91b8..24bfd7ff3049 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -3921,18 +3922,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ -- cgit v1.2.3