From 979883eb803f05da2d4f38b5719c2b18ac06f50d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Aug 2010 17:59:39 +0200 Subject: block_dev: always serialize exclusive open attempts commit e75aa85892b2ee78c79edac720868cbef16e62eb upstream. bd_prepare_to_claim() incorrectly allowed multiple attempts for exclusive open to progress in parallel if the attempting holders are identical. This triggered BUG_ON() as reported in the following bug. https://bugzilla.kernel.org/show_bug.cgi?id=16393 __bd_abort_claiming() is used to finish claiming blocks and doesn't work if multiple openers are inside a claiming block. Allowing multiple parallel open attempts to continue doesn't gain anything as those are serialized down in the call chain anyway. Fix it by always allowing only single open attempt in a claiming block. This problem can easily be reproduced by adding a delay after bd_prepare_to_claim() and attempting to mount two partitions of a disk. Signed-off-by: Tejun Heo Reported-by: Markus Trippelsdorf Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 99d6af81174..b3171fb0dc9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -681,8 +681,8 @@ retry: if (!bd_may_claim(bdev, whole, holder)) return -EBUSY; - /* if someone else is claiming, wait for it to finish */ - if (whole->bd_claiming && whole->bd_claiming != holder) { + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); -- cgit v1.2.3 From a2ed4680f6a386a88f1356b42c72ea45718f3005 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Aug 2010 12:05:43 -0400 Subject: Fix sget() race with failing mount commit 7a4dec53897ecd3367efb1e12fe8a4edc47dc0e9 upstream. If sget() finds a matching superblock being set up, it'll grab an active reference to it and grab s_umount. That's fine - we'll wait for completion of foofs_get_sb() that way. However, if said foofs_get_sb() fails we'll end up holding the halfway-created superblock. deactivate_locked_super() called by foofs_get_sb() will just unlock the sucker since we are holding another active reference to it. What we need is a way to tell if superblock has been successfully set up. Unfortunately, neither ->s_root nor the check for MS_ACTIVE quite fit. Cheap and easy way, suitable for backport: new flag set by the (only) caller of ->get_sb(). If that flag isn't present by the time sget() grabbed s_umount on preexisting superblock it has found, it's seeing a stillborn and should just bury it with deactivate_locked_super() (and repeat the search). Longer term we want to set that flag in ->get_sb() instances (and check for it to distinguish between "sget() found us a live sb" and "sget() has allocated an sb, we need to set it up" in there, instead of checking ->s_root as we do now). Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 2 +- fs/super.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 88058de59c7..32dcd24bbc9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1984,7 +1984,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); diff --git a/fs/super.c b/fs/super.c index 938119ab8dc..c7765bd38ee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -305,8 +305,13 @@ retry: if (s) { up_write(&s->s_umount); destroy_super(s); + s = NULL; } down_write(&old->s_umount); + if (unlikely(!(old->s_flags & MS_BORN))) { + deactivate_locked_super(old); + goto retry; + } return old; } } @@ -909,6 +914,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void goto out_free_secdata; BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); + mnt->mnt_sb->s_flags |= MS_BORN; error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); if (error) -- cgit v1.2.3 From 2414c243d762151570f8a84e5c3c3dc9d8dbae7f Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 10 Aug 2010 18:02:55 -0700 Subject: blkdev: cgroup whitelist permission fix commit b7300b78d1a87625975a799a109a2f98d77757c8 upstream. The cgroup device whitelist code gets confused when trying to grant permission to a disk partition that is not currently open. Part of blkdev_open() includes __blkdev_get() on the whole disk. Basically, the only ways to reliably allow a cgroup access to a partition on a block device when using the whitelist are to 1) also give it access to the whole block device or 2) make sure the partition is already open in a different context. The patch avoids the cgroup check for the whole disk case when opening a partition. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=589662 Signed-off-by: Chris Wright Acked-by: Serge E. Hallyn Tested-by: Serge E. Hallyn Reported-by: Vivek Goyal Cc: Al Viro Cc: Christoph Hellwig Cc: "Daniel P. Berrange" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3171fb0dc9..4c54c86e028 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1339,10 +1339,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) /* * hooks: /n/, see "layering violations". */ - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); - return ret; + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } } lock_kernel(); -- cgit v1.2.3 From 514b1a086de08c090627458fb328aab6f80fc6ab Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 3 Nov 2009 11:45:11 -0600 Subject: eCryptfs: Handle ioctl calls with unlocked and compat functions commit c43f7b8fb03be8bcc579bfc4e6ab70eac887ab55 upstream. Lower filesystems that only implemented unlocked_ioctl weren't being passed ioctl calls because eCryptfs only checked for lower_file->f_op->ioctl and returned -ENOTTY if it was NULL. eCryptfs shouldn't implement ioctl(), since it doesn't require the BKL. This patch introduces ecryptfs_unlocked_ioctl() and ecryptfs_compat_ioctl(), which passes the calls on to the lower file system. https://bugs.launchpad.net/ecryptfs/+bug/469664 Reported-by: James Dupin Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 56 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e8fcf4e2ed7..6e4f84cf73e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -292,12 +292,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) return rc; } -static int ecryptfs_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOTTY; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + return rc; +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOIOCTLCMD; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl) + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + return rc; +} +#endif const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, @@ -313,7 +341,10 @@ const struct file_operations ecryptfs_main_fops = { .write = do_sync_write, .aio_write = generic_file_aio_write, .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, @@ -322,20 +353,3 @@ const struct file_operations ecryptfs_main_fops = { .fasync = ecryptfs_fasync, .splice_read = generic_file_splice_read, }; - -static int -ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, - unsigned long arg) -{ - int rc = 0; - struct file *lower_file = NULL; - - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); - if (lower_file && lower_file->f_op && lower_file->f_op->ioctl) - rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode), - lower_file, cmd, arg); - else - rc = -ENOTTY; - return rc; -} -- cgit v1.2.3 From c336098dff5e41b409aa2191857958954794c01f Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Thu, 29 Jul 2010 13:01:36 +0200 Subject: ecryptfs: release reference to lower mount if interpose fails commit 31f73bee3e170b7cabb35db9e2f4bf7919b9d036 upstream. In ecryptfs_lookup_and_interpose_lower() the lower mount is not decremented if allocation of a dentry info struct failed. As a result the lower filesystem cant be unmounted any more (since it is considered busy). This patch corrects the reference counting. Signed-off-by: Lino Sanfilippo Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 31ef5252f0f..8cd617b66ba 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -264,7 +264,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); - goto out_dput; + goto out_put; } ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); @@ -339,8 +339,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; -out_dput: +out_put: dput(lower_dentry); + mntput(lower_mnt); d_drop(ecryptfs_dentry); out: return rc; -- cgit v1.2.3 From cf5275da2bb68aa5137861a0c151faf8365c47e3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 6 Aug 2010 22:58:49 +0200 Subject: fs/ecryptfs/file.c: introduce missing free commit ceeab92971e8af05c1e81a4ff2c271124b55bb9b upstream. The comments in the code indicate that file_info should be released if the function fails. This releasing is done at the label out_free, not out. The semantic match that finds this problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = kmem_cache_zalloc(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return <+...x...+>; | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmem_cache_zalloc %s" % (p1[0].file,p1[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 6e4f84cf73e..622c9514080 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,7 +199,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); - goto out; + goto out_free; } } if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) @@ -207,7 +207,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); - goto out; + goto out_free; } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); -- cgit v1.2.3 From ee28b37385692e02b0ffd174b68ccd54b714910d Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 10 Aug 2010 18:03:08 -0700 Subject: signalfd: fill in ssi_int for posix timers and message queues commit a2a20c412c86e0bb46a9ab0dd31bcfe6d201b913 upstream. If signalfd is used to consume a signal generated by a POSIX interval timer or POSIX message queue, the ssi_int field does not reflect the data (sigevent->sigev_value) supplied to timer_create(2) or mq_notify(3). (The ssi_ptr field, however, is filled in.) This behavior differs from signalfd's treatment of sigqueue-generated signals -- see the default case in signalfd_copyinfo. It also gives results that differ from the case when a signal is handled conventionally via a sigaction-registered handler. So, set signalfd_siginfo->ssi_int in the remaining cases (__SI_TIMER, __SI_MESGQ) where ssi_ptr is set. akpm: a non-back-compatible change. Merge into -stable to minimise the number of kernels which are in the field and which miss this feature. Signed-off-by: Nathan Lynch Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/signalfd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/signalfd.c b/fs/signalfd.c index f329849ce3c..1c5a6add779 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; case __SI_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); @@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; default: /* -- cgit v1.2.3 From 451ba8bd07ac6a9d6abf9f944213a589b26d81d9 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 9 Aug 2010 15:57:38 -0500 Subject: jfs: don't allow os2 xattr namespace overlap with others commit aca0fa34bdaba39bfddddba8ca70dba4782e8fe6 upstream. It's currently possible to bypass xattr namespace access rules by prefixing valid xattr names with "os2.", since the os2 namespace stores extended attributes in a legacy format with no prefix. This patch adds checking to deny access to any valid namespace prefix following "os2.". Signed-off-by: Dave Kleikamp Reported-by: Sergey Vlasov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/jfs/xattr.c | 87 +++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fa96bbb2634..2d7f165d0f1 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -86,46 +86,25 @@ struct ea_buffer { #define EA_MALLOC 0x0008 +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + /* * These three routines are used to recognize on-disk extended attributes * that are in a recognized namespace. If the attribute is not recognized, * "os2." is prepended to the name */ -static inline int is_os2_xattr(struct jfs_ea *ea) +static int is_os2_xattr(struct jfs_ea *ea) { - /* - * Check for "system." - */ - if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return false; - /* - * Check for "user." - */ - if ((ea->namelen >= XATTR_USER_PREFIX_LEN) && - !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return false; - /* - * Check for "security." - */ - if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - return false; - /* - * Check for "trusted." - */ - if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) && - !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - return false; - /* - * Add any other valid namespace prefixes here - */ - - /* - * We assume it's OS/2's flat namespace - */ - return true; + return !is_known_namespace(ea->name); } static inline int name_size(struct jfs_ea *ea) @@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return can_set_system_xattr(inode, name, value, value_len); + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + /* * Don't allow setting an attribute in an unknown namespace. */ if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && - strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return -EOPNOTSUPP; return 0; @@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, int xattr_size; ssize_t size; int namelen = strlen(name); - char *os2name = NULL; char *value; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, - GFP_KERNEL); - if (!os2name) - return -ENOMEM; - strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); - name = os2name; - namelen -= XATTR_OS2_PREFIX_LEN; - } - down_read(&JFS_IP(inode)->xattr_sem); xattr_size = ea_get(inode, &ea_buf, 0); @@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, out: up_read(&JFS_IP(inode)->xattr_sem); - kfree(os2name); - return size; } @@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); return err; -- cgit v1.2.3 From f768d29ffa26c51dfc570a7e94728cdd6fa929cc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 1 Aug 2010 17:33:29 -0400 Subject: ext4: fix freeze deadlock under IO commit 437f88cc031ffe7f37f3e705367f4fe1f4be8b0f upstream. Commit 6b0310fbf087ad6 caused a regression resulting in deadlocks when freezing a filesystem which had active IO; the vfs_check_frozen level (SB_FREEZE_WRITE) did not let the freeze-related IO syncing through. Duh. Changing the test to FREEZE_TRANS should let the normal freeze syncing get through the fs, but still block any transactions from starting once the fs is completely frozen. I tested this by running fsstress in the background while periodically snapshotting the fs and running fsck on the result. I ran into occasional deadlocks, but different ones. I think this is a fine fix for the problem at hand, and the other deadlocky things will need more investigation. Reported-by: Phillip Susi Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e8983a9811..a45ced96b04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,7 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ @@ -3491,7 +3491,7 @@ int ext4_force_commit(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; if (journal) { - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); ret = ext4_journal_force_commit(journal); } -- cgit v1.2.3 From 01ac8551a0c8469cf4d535dd2583364a1be5b572 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Aug 2010 11:35:52 -0700 Subject: mm: fix up some user-visible effects of the stack guard page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d7824370e26325c881b665350ce64fb0a4fde24a upstream. This commit makes the stack guard page somewhat less visible to user space. It does this by: - not showing the guard page in /proc//maps It looks like lvm-tools will actually read /proc/self/maps to figure out where all its mappings are, and effectively do a specialized "mlockall()" in user space. By not showing the guard page as part of the mapping (by just adding PAGE_SIZE to the start for grows-up pages), lvm-tools ends up not being aware of it. - by also teaching the _real_ mlock() functionality not to try to lock the guard page. That would just expand the mapping down to create a new guard page, so there really is no point in trying to lock it in place. It would perhaps be nice to show the guard page specially in /proc//maps (or at least mark grow-down segments some way), but let's not open ourselves up to more breakage by user space from programs that depends on the exact deails of the 'maps' file. Special thanks to Henrique de Moraes Holschuh for diving into lvm-tools source code to see what was going on with the whole new warning. Reported-and-tested-by: François Valenduc Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index aea1d3f1ffb..439fc1f1c1c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; + unsigned long start; dev_t dev = 0; int len; @@ -220,8 +221,13 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (vma->vm_flags & VM_GROWSDOWN) + start += PAGE_SIZE; + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, + start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', -- cgit v1.2.3 From c23c1693ecfb0a68e54fbf306ac99074d5ee5c6a Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 16 Jul 2010 11:21:23 +0800 Subject: ocfs2: do not overwrite error codes in ocfs2_init_acl commit 6eda3dd33f8a0ce58ee56a11351758643a698db4 upstream. Setting the acl while creating a new inode depends on the error codes of posix_acl_create_masq. This patch fix a issue of overwriting the error codes of it. Reported-by: Pawel Zawora Signed-off-by: Tiger Yang Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/acl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index da702294d7e..5a217783321 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -344,7 +344,7 @@ int ocfs2_init_acl(handle_t *handle, { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; - int ret = 0; + int ret = 0, ret2; mode_t mode; if (!S_ISLNK(inode->i_mode)) { @@ -381,7 +381,12 @@ int ocfs2_init_acl(handle_t *handle, mode = inode->i_mode; ret = posix_acl_create_masq(clone, &mode); if (ret >= 0) { - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } if (ret > 0) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_ACCESS, -- cgit v1.2.3 From 072cbb3f1ecff3debed7a037c4a32d3a58585c3e Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 16 Jul 2010 23:13:33 +0800 Subject: ocfs2/dlm: fix a dead lock commit 6d98c3ccb52f692f1a60339dde7c700686a5568b upstream. When we have to take both dlm->master_lock and lockres->spinlock, take them in order lockres->spinlock and then dlm->master_lock. The patch fixes a violation of the rule. We can simply move taking dlm->master_lock to where we have dropped res->spinlock since when we access res->state and free mle memory we don't need master_lock's protection. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlm/dlmmaster.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 94b97fc6a88..6d098b89d46 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3050,8 +3050,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, /* check for pre-existing lock */ spin_lock(&dlm->spinlock); res = __dlm_lookup_lockres(dlm, name, namelen, hash); - spin_lock(&dlm->master_lock); - if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -3069,14 +3067,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } + spin_lock(&dlm->master_lock); /* ignore status. only nonzero status would BUG. */ ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, migrate->new_master, migrate->master); -unlock: spin_unlock(&dlm->master_lock); +unlock: spin_unlock(&dlm->spinlock); if (oldmle) { -- cgit v1.2.3 From 602738340fb5be2de22faddd5d67eb39cc040307 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Mon, 19 Jul 2010 16:04:12 -0700 Subject: ocfs2 fix o2dlm dlm run purgelist (rev 3) commit 7beaf243787f85a2ef9213ccf13ab4a243283fde upstream. This patch fixes two problems in dlm_run_purgelist 1. If a lockres is found to be in use, dlm_run_purgelist keeps trying to purge the same lockres instead of trying the next lockres. 2. When a lockres is found unused, dlm_run_purgelist releases lockres spinlock before setting DLM_LOCK_RES_DROPPING_REF and calls dlm_purge_lockres. spinlock is reacquired but in this window lockres can get reused. This leads to BUG. This patch modifies dlm_run_purgelist to skip lockres if it's in use and purge next lockres. It also sets DLM_LOCK_RES_DROPPING_REF before releasing the lockres spinlock protecting it from getting reused. Signed-off-by: Srinivas Eeda Acked-by: Sunil Mushran Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlm/dlmthread.c | 80 ++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d4f73ca68fe..dd78ca3f360 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -152,45 +152,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } -static int dlm_purge_lockres(struct dlm_ctxt *dlm, +static void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int master; int ret = 0; - spin_lock(&res->spinlock); - if (!__dlm_lockres_unused(res)) { - mlog(0, "%s:%.*s: tried to purge but not unused\n", - dlm->name, res->lockname.len, res->lockname.name); - __dlm_print_one_lock_resource(res); - spin_unlock(&res->spinlock); - BUG(); - } - - if (res->state & DLM_LOCK_RES_MIGRATING) { - mlog(0, "%s:%.*s: Delay dropref as this lockres is " - "being remastered\n", dlm->name, res->lockname.len, - res->lockname.name); - /* Re-add the lockres to the end of the purge list */ - if (!list_empty(&res->purge)) { - list_del_init(&res->purge); - list_add_tail(&res->purge, &dlm->purge_list); - } - spin_unlock(&res->spinlock); - return 0; - } + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); master = (res->owner == dlm->node_num); - if (!master) - res->state |= DLM_LOCK_RES_DROPPING_REF; - spin_unlock(&res->spinlock); mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, res->lockname.name, master); if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -208,31 +188,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); } - spin_lock(&res->spinlock); if (!list_empty(&res->purge)) { mlog(0, "removing lockres %.*s:%p from purgelist, " "master = %d\n", res->lockname.len, res->lockname.name, res, master); list_del_init(&res->purge); - spin_unlock(&res->spinlock); dlm_lockres_put(res); dlm->purge_count--; - } else - spin_unlock(&res->spinlock); + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } __dlm_unhash_lockres(res); /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { - spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_DROPPING_REF; spin_unlock(&res->spinlock); wake_up(&res->wq); - } - return 0; + } else + spin_unlock(&res->spinlock); } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -251,17 +235,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, lockres = list_entry(dlm->purge_list.next, struct dlm_lock_resource, purge); - /* Status of the lockres *might* change so double - * check. If the lockres is unused, holding the dlm - * spinlock will prevent people from getting and more - * refs on it -- there's no need to keep the lockres - * spinlock. */ spin_lock(&lockres->spinlock); - unused = __dlm_lockres_unused(lockres); - spin_unlock(&lockres->spinlock); - - if (!unused) - continue; purge_jiffies = lockres->last_used + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); @@ -273,15 +247,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * in tail order, we can stop at the first * unpurgable resource -- anyone added after * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); break; } + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING)) { + mlog(0, "lockres %s:%.*s: is in use or " + "being remastered, used %d, state %d\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, !unused, lockres->state); + list_move_tail(&dlm->purge_list, &lockres->purge); + spin_unlock(&lockres->spinlock); + continue; + } + dlm_lockres_get(lockres); - /* This may drop and reacquire the dlm spinlock if it - * has to do migration. */ - if (dlm_purge_lockres(dlm, lockres)) - BUG(); + dlm_purge_lockres(dlm, lockres); dlm_lockres_put(lockres); -- cgit v1.2.3 From e95a476c0dbe727effd617608d8996ace7a98164 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 22 Jul 2010 13:56:45 +0800 Subject: ocfs2: Count more refcount records in file system fragmentation. commit 8a2e70c40ff58f82dde67770e6623ca45f0cb0c8 upstream. The refcount record calculation in ocfs2_calc_refcount_meta_credits is too optimistic that we can always allocate contiguous clusters and handle an already existed refcount rec as a whole. Actually because of file system fragmentation, we may have the chance to split a refcount record into 3 parts during the transaction. So consider the worst case in record calculation. Signed-off-by: Tao Ma Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/refcounttree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3ac5aa733e9..73a11ccfd4c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2436,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; /* - * If the refcount rec already exist, cool. We just need - * to check whether there is a split. Otherwise we just need - * to increase the refcount. - * If we will insert one, increases recs_add. - * * We record all the records which will be inserted to the * same refcount block, so that we can tell exactly whether * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). */ if (rec.r_refcount) { + recs_add += 2; /* Check whether we need a split at the beginning. */ if (cpos == start_cpos && cpos != le64_to_cpu(rec.r_cpos)) -- cgit v1.2.3 From ba0864d8ae58007329c10119941c73747de48016 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 30 Jul 2010 16:14:44 +0800 Subject: ocfs2/dlm: avoid incorrect bit set in refmap on recovery master commit a524812b7eaa7783d7811198921100f079034e61 upstream. In the following situation, there remains an incorrect bit in refmap on the recovery master. Finally the recovery master will fail at purging the lockres due to the incorrect bit in refmap. 1) node A has no interest on lockres A any longer, so it is purging it. 2) the owner of lockres A is node B, so node A is sending de-ref message to node B. 3) at this time, node B crashed. node C becomes the recovery master. it recovers lockres A(because the master is the dead node B). 4) node A migrated lockres A to node C with a refbit there. 5) node A failed to send de-ref message to node B because it crashed. The failure is ignored. no other action is done for lockres A any more. For mormal, re-send the deref message to it to recovery master can fix it. Well, ignoring the failure of deref to the original master and not recovering the lockres to recovery master has the same effect. And the later is simpler. Signed-off-by: Wengang Wang Acked-by: Srinivas Eeda Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlm/dlmrecovery.c | 22 ++++++++++------------ fs/ocfs2/dlm/dlmthread.c | 34 +++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9dfaac73b36..aaaffbcbe91 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct list_head *queue; struct dlm_lock *lock, *next; + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); res->state |= DLM_LOCK_RES_RECOVERING; if (!list_empty(&res->recovering)) { mlog(0, @@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { - if (res->state & DLM_LOCK_RES_DROPPING_REF) - mlog(0, "%s:%.*s: owned by " - "dead node %u, this node was " - "dropping its ref when it died. " - "continue, dropping the flag.\n", - dlm->name, res->lockname.len, - res->lockname.name, dead_node); - - /* the wake_up for this will happen when the - * RECOVERING flag is dropped later */ - res->state &= ~DLM_LOCK_RES_DROPPING_REF; + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "Ignore %.*s for " + "recovery as it is being freed\n", + res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); - dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index dd78ca3f360..2211acf33d9 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res) * truly ready to be freed. */ int __dlm_lockres_unused(struct dlm_lock_resource *res) { - if (!__dlm_lockres_has_locks(res) && - (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { - /* try not to scan the bitmap unless the first two - * conditions are already true */ - int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (bit >= O2NM_MAX_NODES) { - /* since the bit for dlm->node_num is not - * set, inflight_locks better be zero */ - BUG_ON(res->inflight_locks != 0); - return 1; - } - } - return 0; + int bit; + + if (__dlm_lockres_has_locks(res)) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + /* + * since the bit for dlm->node_num is not set, inflight_locks better + * be zero + */ + BUG_ON(res->inflight_locks != 0); + return 1; } -- cgit v1.2.3 From 6bd0fca976acb4e6629988e7aa1646680fb07a1e Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 30 Jul 2010 23:18:00 +0800 Subject: ocfs2/dlm: remove potential deadlock -V3 commit b11f1f1ab73fd358b1b734a9427744802202ba68 upstream. When we need to take both dlm_domain_lock and dlm->spinlock, we should take them in order of: dlm_domain_lock then dlm->spinlock. There is pathes disobey this order. That is calling dlm_lockres_put() with dlm->spinlock held in dlm_run_purge_list. dlm_lockres_put() calls dlm_put() at the ref and dlm_put() locks on dlm_domain_lock. Fix: Don't grab/put the dlm when the initialising/releasing lockres. That grab is not required because we don't call dlm_unregister_domain() based on refcount. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlm/dlmmaster.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6d098b89d46..ffb4c68dafa 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref) atomic_dec(&dlm->res_cur_count); - dlm_put(dlm); - if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || !list_empty(&res->converting) || @@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->migration_pending = 0; res->inflight_locks = 0; - /* put in dlm_lockres_release */ - dlm_grab(dlm); res->dlm = dlm; kref_init(&res->refs); -- cgit v1.2.3 From 58f6e9622d6d20503e22f5c04e2877cfb67811d4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Aug 2010 12:42:24 +0900 Subject: nilfs2: fix list corruption after ifile creation failure commit af4e36318edb848fcc0a8d5f75000ca00cdc7595 upstream. If nilfs_attach_checkpoint() gets a memory allocation failure during creation of ifile, it will return without removing nilfs_sb_info struct from ns_supers list. When a concurrently mounted snapshot is unmounted or another new snapshot is mounted after that, this causes kernel oops as below: > BUG: unable to handle kernel NULL pointer dereference at (null) > IP: [] nilfs_find_sbinfo+0x74/0xa4 [nilfs2] > *pde = 00000000 > Oops: 0000 [#1] SMP > Call Trace: > [] ? nilfs_get_sb+0x165/0x532 [nilfs2] > [] ? ida_get_new_above+0x16d/0x187 > [] ? alloc_vfsmnt+0x7e/0x10a > [] ? kstrdup+0x2c/0x40 > [] ? vfs_kern_mount+0x96/0x14e > [] ? do_kern_mount+0x32/0xbd > [] ? do_mount+0x642/0x6a1 > [] ? do_page_fault+0x0/0x2d1 > [] ? copy_mount_options+0x80/0xe2 > [] ? strndup_user+0x48/0x67 > [] ? sys_mount+0x61/0x90 > [] ? sysenter_do_call+0x12/0x22 This fixes the problem. Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 414ef68931c..fbb354c36e6 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -336,9 +336,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); + err = -ENOMEM; sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); if (!sbi->s_ifile) - return -ENOMEM; + goto delist; down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, @@ -369,6 +370,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; + delist: down_write(&nilfs->ns_super_sem); list_del_init(&sbi->s_list); up_write(&nilfs->ns_super_sem); -- cgit v1.2.3 From 4b1ebda3fa2eecdb15e408cef88fb6fca5f84a81 Mon Sep 17 00:00:00 2001 From: Jiaju Zhang Date: Wed, 28 Jul 2010 13:21:06 +0800 Subject: Fix the nested PR lock calling issue in ACL commit 845b6cf34150100deb5f58c8a37a372b111f2918 upstream. Hi, Thanks a lot for all the review and comments so far;) I'd like to send the improved (V4) version of this patch. This patch fixes a deadlock in OCFS2 ACL. We found this bug in OCFS2 and Samba integration using scenario, the symptom is several smbd processes will be hung under heavy workload. Finally we found out it is the nested PR lock calling that leads to this deadlock: node1 node2 gr PR | V PR(EX)---> BAST:OCFS2_LOCK_BLOCKED | V rq PR | V wait=1 After requesting the 2nd PR lock, the process "smbd" went into D state. It can only be woken up when the 1st PR lock's RO holder equals zero. There should be an ocfs2_inode_unlock in the calling path later on, which can decrement the RO holder. But since it has been in uninterruptible sleep, the unlock function has no chance to be called. The related stack trace is: smbd D ffff8800013d0600 0 9522 5608 0x00000000 ffff88002ca7fb18 0000000000000282 ffff88002f964500 ffff88002ca7fa98 ffff8800013d0600 ffff88002ca7fae0 ffff88002f964340 ffff88002f964340 ffff88002ca7ffd8 ffff88002ca7ffd8 ffff88002f964340 ffff88002f964340 Call Trace: [] schedule_timeout+0x175/0x210 [] wait_for_common+0xf0/0x210 [] __ocfs2_cluster_lock+0x3b9/0xa90 [ocfs2] [] ocfs2_inode_lock_full_nested+0x255/0xdb0 [ocfs2] [] ocfs2_get_acl+0x69/0x120 [ocfs2] [] ocfs2_check_acl+0x28/0x80 [ocfs2] [] acl_permission_check+0x57/0xb0 [] generic_permission+0x1d/0xc0 [] ocfs2_permission+0x10a/0x1d0 [ocfs2] [] inode_permission+0x45/0x100 [] sys_chdir+0x53/0x90 [] system_call_fastpath+0x16/0x1b [<00007f34a4ef6927>] 0x7f34a4ef6927 For details, please see: https://bugzilla.novell.com/show_bug.cgi?id=614332 and http://oss.oracle.com/bugzilla/show_bug.cgi?id=1278 Signed-off-by: Jiaju Zhang Acked-by: Mark Fasheh Signed-off-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/acl.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 5a217783321..a76e0aa5cd3 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle, int ocfs2_check_acl(struct inode *inode, int mask) { - struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; - if (IS_ERR(acl)) + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return ret; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + + brelse(di_bh); + + if (IS_ERR(acl)) { + mlog_errno(PTR_ERR(acl)); return PTR_ERR(acl); + } if (acl) { - int ret = posix_acl_permission(inode, acl, mask); + ret = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); return ret; } -- cgit v1.2.3 From 8d93451050a832b7fc400b6781b0f4e4bc89f7b7 Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Tue, 10 Aug 2010 17:28:01 -0400 Subject: nfs: Add "lookupcache" to displayed mount options commit 9b00c64318cc337846a7a08a5678f5f19aeff188 upstream. Running "cat /proc/mounts" fails to display the "lookupcache" option. This oversight cost me a bunch of wasted time recently. The following simple patch fixes it. Signed-off-by: Patrick LoPresti Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9df16de4a5..6bf11d745a3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -652,6 +652,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } } /* -- cgit v1.2.3 From 747f1982702433a40b2ed88945084c1b1746f26e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Aug 2010 09:25:42 -0400 Subject: NFS: Fix an Oops in the NFSv4 atomic open code commit 0a377cff9428af2da2b293d11e07bc4dbf064ee5 upstream. Adam Lackorzynski reports: with 2.6.35.2 I'm getting this reproducible Oops: [ 110.825396] BUG: unable to handle kernel NULL pointer dereference at (null) [ 110.828638] IP: [] encode_attrs+0x1a/0x2a4 [ 110.828638] PGD be89f067 PUD bf18f067 PMD 0 [ 110.828638] Oops: 0000 [#1] SMP [ 110.828638] last sysfs file: /sys/class/net/lo/operstate [ 110.828638] CPU 2 [ 110.828638] Modules linked in: rtc_cmos rtc_core rtc_lib amd64_edac_mod i2c_amd756 edac_core i2c_core dm_mirror dm_region_hash dm_log dm_snapshot sg sr_mod usb_storage ohci_hcd mptspi tg3 mptscsih mptbase usbcore nls_base [last unloaded: scsi_wait_scan] [ 110.828638] [ 110.828638] Pid: 11264, comm: setchecksum Not tainted 2.6.35.2 #1 [ 110.828638] RIP: 0010:[] [] encode_attrs+0x1a/0x2a4 [ 110.828638] RSP: 0000:ffff88003bf5b878 EFLAGS: 00010296 [ 110.828638] RAX: ffff8800bddb48a8 RBX: ffff88003bf5bb18 RCX: 0000000000000000 [ 110.828638] RDX: ffff8800be258800 RSI: 0000000000000000 RDI: ffff88003bf5b9f8 [ 110.828638] RBP: 0000000000000000 R08: ffff8800bddb48a8 R09: 0000000000000004 [ 110.828638] R10: 0000000000000003 R11: ffff8800be779000 R12: ffff8800be258800 [ 110.828638] R13: ffff88003bf5b9f8 R14: ffff88003bf5bb20 R15: ffff8800be258800 [ 110.828638] FS: 0000000000000000(0000) GS:ffff880041e00000(0063) knlGS:00000000556bd6b0 [ 110.828638] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b [ 110.828638] CR2: 0000000000000000 CR3: 00000000be8ef000 CR4: 00000000000006e0 [ 110.828638] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 110.828638] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 110.828638] Process setchecksum (pid: 11264, threadinfo ffff88003bf5a000, task ffff88003f232210) [ 110.828638] Stack: [ 110.828638] 0000000000000000 ffff8800bfbcf920 0000000000000000 0000000000000ffe [ 110.828638] <0> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 110.828638] <0> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 110.828638] Call Trace: [ 110.828638] [] ? nfs4_xdr_enc_setattr+0x90/0xb4 [ 110.828638] [] ? call_transmit+0x1c3/0x24a [ 110.828638] [] ? __rpc_execute+0x78/0x22a [ 110.828638] [] ? rpc_run_task+0x21/0x2b [ 110.828638] [] ? rpc_call_sync+0x3d/0x5d [ 110.828638] [] ? _nfs4_do_setattr+0x11b/0x147 [ 110.828638] [] ? nfs_init_locked+0x0/0x32 [ 110.828638] [] ? ifind+0x4e/0x90 [ 110.828638] [] ? nfs4_do_setattr+0x4b/0x6e [ 110.828638] [] ? nfs4_do_open+0x291/0x3a6 [ 110.828638] [] ? nfs4_open_revalidate+0x63/0x14a [ 110.828638] [] ? nfs_open_revalidate+0xd7/0x161 [ 110.828638] [] ? do_lookup+0x1a4/0x201 [ 110.828638] [] ? link_path_walk+0x6a/0x9d5 [ 110.828638] [] ? do_last+0x17b/0x58e [ 110.828638] [] ? do_filp_open+0x1bd/0x56e [ 110.828638] [] ? _atomic_dec_and_lock+0x30/0x48 [ 110.828638] [] ? dput+0x37/0x152 [ 110.828638] [] ? alloc_fd+0x69/0x10a [ 110.828638] [] ? do_sys_open+0x56/0x100 [ 110.828638] [] ? ia32_sysret+0x0/0x5 [ 110.828638] Code: 83 f1 01 e8 f5 ca ff ff 48 83 c4 50 5b 5d 41 5c c3 41 57 41 56 41 55 49 89 fd 41 54 49 89 d4 55 48 89 f5 53 48 81 ec 18 01 00 00 <8b> 06 89 c2 83 e2 08 83 fa 01 19 db 83 e3 f8 83 c3 18 a8 01 8d [ 110.828638] RIP [] encode_attrs+0x1a/0x2a4 [ 110.828638] RSP [ 110.828638] CR2: 0000000000000000 [ 112.840396] ---[ end trace 95282e83fd77358f ]--- We need to ensure that the O_EXCL flag is turned off if the user doesn't set O_CREAT. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4proc.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e60416d3f81..d69551e77b9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1103,7 +1103,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open_dput; /* We can't create new files, or truncate existing ones here */ - openflags &= ~(O_CREAT|O_TRUNC); + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); /* * Note: we're not holding inode->i_mutex and so may be racing with diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 70015dd60a9..330a3c9a1a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2023,7 +2023,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct rpc_cred *cred; struct nfs4_state *state; struct dentry *res; - fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + int open_flags = nd->intent.open.flags; + fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -2031,8 +2032,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (!IS_POSIXACL(dir)) attr.ia_mode &= ~current_umask(); } else { + open_flags &= ~O_EXCL; attr.ia_valid = 0; - BUG_ON(nd->intent.open.flags & O_CREAT); + BUG_ON(open_flags & O_CREAT); } cred = rpc_lookup_cred(); @@ -2041,7 +2043,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) { -- cgit v1.2.3 From 807c4c0e43e2c161f3ca73030617038ee7f775d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Aug 2010 16:34:43 +0100 Subject: Fix init ordering of /dev/console vs callers of modprobe commit 31d1d48e199e99077fb30f6fb9a793be7bec756f upstream. Make /dev/console get initialised before any initialisation routine that invokes modprobe because if modprobe fails, it's going to want to open /dev/console, presumably to write an error message to. The problem with that is that if the /dev/console driver is not yet initialised, the chardev handler will call request_module() to invoke modprobe, which will fail, because we never compile /dev/console as a module. This will lead to a modprobe loop, showing the following in the kernel log: request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 request_module: runaway loop modprobe char-major-5-1 This can happen, for example, when the built in md5 module can't find the built in cryptomgr module (because the latter fails to initialise). The md5 module comes before the call to tty_init(), presumably because 'crypto' comes before 'drivers' alphabetically. Fix this by calling tty_init() from chrdev_init(). Signed-off-by: David Howells Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/char_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933df2b..f80a4f25123 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" -- cgit v1.2.3 From 5efe96cd7a842cb0e95dc8f05ed4a44358ae42bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:30 +1000 Subject: xfs: fix untrusted inode number lookup commit 4536f2ad8b330453d7ebec0746c4374eadd649b1 upstream. Commit 7124fe0a5b619d65b739477b3b55a20bf805b06d ("xfs: validate untrusted inode numbers during lookup") changes the inode lookup code to do btree lookups for untrusted inode numbers. This change made an invalid assumption about the alignment of inodes and hence incorrectly calculated the first inode in the cluster. As a result, some inode numbers were being incorrectly considered invalid when they were actually valid. The issue was not picked up by the xfstests suite because it always runs fsr and dump (the two utilities that utilise the bulkstat interface) on cache hot inodes and hence the lookup code in the cold cache path was not sufficiently exercised to uncover this intermittent problem. Fix the issue by relaxing the btree lookup criteria and then checking if the record returned contains the inode number we are lookup for. If it we get an incorrect record, then the inode number is invalid. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ialloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c7142a064c4..eb779affaaa 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1217,7 +1217,6 @@ xfs_imap_lookup( struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - xfs_agino_t startino; int error; int i; @@ -1231,13 +1230,13 @@ xfs_imap_lookup( } /* - * derive and lookup the exact inode record for the given agino. If the - * record cannot be found, then it's an invalid inode number and we - * should abort. + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1250,6 +1249,11 @@ xfs_imap_lookup( if (error) return error; + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) -- cgit v1.2.3 From 4f403a5d075f381c29da1e903a6b40b68c99846b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:41 +1000 Subject: xfs: ensure we mark all inodes in a freed cluster XFS_ISTALE commit 5b3eed756cd37255cad1181bd86bfd0977e97953 upstream. Under heavy load parallel metadata loads (e.g. dbench), we can fail to mark all the inodes in a cluster being freed as XFS_ISTALE as we skip inodes we cannot get the XFS_ILOCK_EXCL or the flush lock on. When this happens and the inode cluster buffer has already been marked stale and freed, inode reclaim can try to write the inode out as it is dirty and not marked stale. This can result in writing th metadata to an freed extent, or in the case it has already been overwritten trigger a magic number check failure and return an EUCLEAN error such as: Filesystem "ram0": inode 0x442ba1 background reclaim flush failed with 117 Fix this by ensuring that we hoover up all in memory inodes in the cluster and mark them XFS_ISTALE when freeing the cluster. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b76a829d7e2..f70221820a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1927,6 +1927,11 @@ xfs_iunlink_remove( return 0; } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -1958,8 +1963,6 @@ xfs_ifree_cluster( } for (j = 0; j < nbufs; j++, inum += ninodes) { - int found = 0; - blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1978,7 +1981,9 @@ xfs_ifree_cluster( /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. */ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); while (lip) { @@ -1990,11 +1995,11 @@ xfs_ifree_cluster( &iip->ili_flush_lsn, &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - found++; } lip = lip->li_bio_list; } + /* * For each inode in memory attempt to add it to the inode * buffer and set it up for being staled on buffer IO @@ -2006,6 +2011,7 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < ninodes; i++) { +retry: read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2016,38 +2022,36 @@ xfs_ifree_cluster( continue; } - /* don't try to lock/unlock the current inode */ + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); - continue; + delay(1); + goto retry; } read_unlock(&pag->pag_ici_lock); - if (!xfs_iflock_nowait(ip)) { - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - + xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - if (!iip) { - /* inode with unlogged changes only */ + if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2063,8 +2067,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (found) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } -- cgit v1.2.3 From f7b15355f8caf92516254326bc38e0970d8f9115 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jul 2010 21:17:09 +0000 Subject: direct-io: move aio_complete into ->end_io commit 40e2e97316af6e62affab7a392e792494b8d9dde upstream. Filesystems with unwritten extent support must not complete an AIO request until the transaction to convert the extent has been commited. That means the aio_complete calls needs to be moved into the ->end_io callback so that the filesystem can control when to call it exactly. This makes a bit of a mess out of dio_complete and the ->end_io callback prototype even more complicated. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Alex Elder Cc: Chuck Ebbert Signed-off-by: Greg Kroah-Hartman --- fs/direct-io.c | 26 ++++++++++++++------------ fs/ext4/inode.c | 10 +++++++--- fs/ocfs2/aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.h | 2 ++ 5 files changed, 35 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aacf531..a10cb91cade 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 42272d67955..0afc8c1d8cf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3775,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - return; + goto out; } io_end->offset = offset; @@ -3812,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 356e976772b..96337a4fbbd 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -578,7 +578,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -592,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 34640d6dbdc..5895aaf62ac 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1599,7 +1599,9 @@ xfs_end_io_direct( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private) + void *private, + int ret, + bool is_async) { xfs_ioend_t *ioend = iocb->private; @@ -1645,6 +1647,9 @@ xfs_end_io_direct( * against double-freeing. */ iocb->private = NULL; + + if (is_async) + aio_complete(iocb, ret, 0); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea87df..9f566d92ae3 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -- cgit v1.2.3 From 2851d0064911f0cdd9a39e5607a7b20de9a44d96 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Tue, 27 Jul 2010 11:56:06 -0400 Subject: ext4: move aio completion after unwritten extent conversion commit 5b3ff237bef43b9e7fb7d1eb858e29b73fd664f9 upstream. This patch is to be applied upon Christoph's "direct-io: move aio_complete into ->end_io" patch. It adds iocb and result fields to struct ext4_io_end_t, so that we can call aio_complete from ext4_end_io_nolock() after the extent conversion has finished. I have verified with Christoph's aio-dio test that used to fail after a few runs on an original kernel but now succeeds on the patched kernel. See http://thread.gmane.org/gmane.comp.file-systems.ext4/19659 for details. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" Cc: Chuck Ebbert Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 4 +++- fs/ext4/inode.c | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 19a4de57128..4c9f05dc704 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -167,13 +167,15 @@ struct mpage_da_data { }; #define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { - struct list_head list; /* per-file finished AIO list */ + struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ } ext4_io_end_t; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0afc8c1d8cf..3fb64b9e47c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3668,6 +3668,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } + if (io->iocb) + aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ io->flag = 0; return ret; @@ -3767,6 +3769,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) io->offset = 0; io->size = 0; io->page = NULL; + io->iocb = NULL; + io->result = 0; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3796,12 +3800,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - goto out; +out: + if (is_async) + aio_complete(iocb, ret, 0); + return; } io_end->offset = offset; io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ @@ -3813,9 +3823,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; -out: - if (is_async) - aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -- cgit v1.2.3 From 9d27ee433011842618908f8d8117e0148a40902b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Jul 2010 21:17:10 +0000 Subject: xfs: move aio completion after unwritten extent conversion commit fb511f2150174b18b28ad54708c1adda0df39b17 upstream. If we write into an unwritten extent using AIO we need to complete the AIO request after the extent conversion has finished. Without that a read could race to see see the extent still unwritten and return zeros. For synchronous I/O we already take care of that by flushing the xfsconvertd workqueue (which might be a bit of overkill). To do that add iocb and result fields to struct xfs_ioend, so that we can call aio_complete from xfs_end_io after the extent conversion has happened. Note that we need a new result field as io_error is used for positive errno values, while the AIO code can return negative error values and positive transfer sizes. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder Cc: Chuck Ebbert Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_aops.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 5895aaf62ac..f9f5567bf89 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -275,8 +275,11 @@ xfs_end_io( xfs_finish_ioend(ioend, 0); /* ensure we don't spin on blocked ioends */ delay(1); - } else + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); + } } /* @@ -309,6 +312,8 @@ xfs_alloc_ioend( atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -1604,6 +1609,7 @@ xfs_end_io_direct( bool is_async) { xfs_ioend_t *ioend = iocb->private; + bool complete_aio = is_async; /* * Non-NULL private data means we need to issue a transaction to @@ -1629,7 +1635,14 @@ xfs_end_io_direct( if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { - xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); + if (is_async) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + complete_aio = false; + xfs_finish_ioend(ioend, 0); + } else { + xfs_finish_ioend(ioend, 1); + } } else { /* * A direct I/O write ioend starts it's life in unwritten @@ -1648,7 +1661,7 @@ xfs_end_io_direct( */ iocb->private = NULL; - if (is_async) + if (complete_aio) aio_complete(iocb, ret, 0); } -- cgit v1.2.3 From a16e33acc48c98cc8941cf993acc8e2bb9d7dd45 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Sep 2010 13:42:41 +0200 Subject: fuse: flush background queue on connection close commit 595afaf9e6ee1b48e13ec4b8bcc8c7dee888161a upstream. David Bartly reported that fuse can hang in fuse_get_req_nofail() when the connection to the filesystem server is no longer active. If bg_queue is not empty then flush_bg_queue() called from request_end() can put more requests on to the pending queue. If this happens while ending requests on the processing queue then those background requests will be queued to the pending list and never ended. Another problem is that fuse_dev_release() didn't wake up processes sleeping on blocked_waitq. Solve this by: a) flushing the background queue before calling end_requests() on the pending and processing queues b) setting blocked = 0 and waking up processes waiting on blocked_waitq() Thanks to David for an excellent bug report. Reported-by: David Bartley Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9424796d663..e5cdabf4945 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1552,6 +1552,14 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1578,8 +1586,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1594,8 +1601,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } -- cgit v1.2.3 From f688ae5b5c70bf47acf9a88b851a81ec3a88b340 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 12 Aug 2010 16:24:26 -0700 Subject: ocfs2: Fix incorrect checksum validation error commit f5ce5a08a40f2086435858ddc80cb40394b082eb upstream. For local mounts, ocfs2_read_locked_inode() calls ocfs2_read_blocks_sync() to read the inode off the disk. The latter first checks to see if that block is cached in the journal, and, if so, returns that block. That is ok. But ocfs2_read_locked_inode() goes wrong when it tries to validate the checksum of such blocks. Blocks that are cached in the journal may not have had their checksum computed as yet. We should not validate the checksums of such blocks. Fixes ossbz#1282 http://oss.oracle.com/bugzilla/show_bug.cgi?id=1282 Signed-off-by: Sunil Mushran Singed-off-by: Tao Ma Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index abb0a95cc71..201e7bcec75 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { -- cgit v1.2.3 From 102b0617cb2b1607c325d2b60fed9d46d06cc47e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:12:29 +0200 Subject: sysfs: checking for NULL instead of ERR_PTR commit 57f9bdac2510cd7fda58e4a111d250861eb1ebeb upstream. d_path() returns an ERR_PTR and it doesn't return NULL. Signed-off-by: Dan Carpenter Reviewed-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1beaa739d0a..cd796847c81 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ -- cgit v1.2.3 From c015fda4db4214fc50835e507084b7c95c85e62f Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 9 Sep 2010 16:37:33 -0700 Subject: O_DIRECT: fix the splitting up of contiguous I/O commit 7a801ac6f5067539ceb5fad0fe90ec49fc156e47 upstream. commit c2c6ca4 (direct-io: do not merge logically non-contiguous requests) introduced a bug whereby all O_DIRECT I/Os were submitted a page at a time to the block layer. The problem is that the code expected dio->block_in_file to correspond to the current page in the dio. In fact, it corresponds to the previous page submitted via submit_page_section. This was purely an oversight, as the dio->cur_page_fs_offset field was introduced for just this purpose. This patch simply uses the correct variable when calculating whether there is a mismatch between contiguous logical blocks and contiguous physical blocks (as described in the comments). I also switched the if conditional following this check to an else if, to ensure that we never call dio_bio_submit twice for the same dio (in theory, this should not happen, anyway). I've tested this by running blktrace and verifying that a 64KB I/O was submitted as a single I/O. I also ran the patched kernel through xfstests' aio tests using xfs, ext4 (with 1k and 4k block sizes) and btrfs and verified that there were no regressions as compared to an unpatched kernel. Signed-off-by: Jeff Moyer Acked-by: Josef Bacik Cc: Christoph Hellwig Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index a10cb91cade..458fdd360dd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { - loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t cur_offset = dio->cur_page_fs_offset; loff_t bio_next_offset = dio->logical_offset_in_bio + dio->bio->bi_size; @@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio) * Submit now if the underlying fs is about to perform a * metadata read */ - if (dio->boundary) + else if (dio->boundary) dio_bio_submit(dio); } -- cgit v1.2.3 From f569f31fe080173f84eb9995ba8766aa813a1b8e Mon Sep 17 00:00:00 2001 From: Menyhart Zoltan Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: statfs() gives ESTALE error commit fbf3fdd2443965d9ba6fb4b5fecd1f6e0847218f upstream. Hi, An NFS client executes a statfs("file", &buff) call. "file" exists / existed, the client has read / written it, but it has already closed it. user_path(pathname, &path) looks up "file" successfully in the directory-cache and restarts the aging timer of the directory-entry. Even if "file" has already been removed from the server, because the lookupcache=positive option I use, keeps the entries valid for a while. nfs_statfs() returns ESTALE if "file" has already been removed from the server. If the user application repeats the statfs("file", &buff) call, we are stuck: "file" remains young forever in the directory-cache. Signed-off-by: Zoltan Menyhart Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6bf11d745a3..381d9291ea1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } nfs_free_fattr(res.fattr); if (error < 0) goto out_err; -- cgit v1.2.3 From 14066dd4e5558f0622ee6d6ffbb9a2331bd9348b Mon Sep 17 00:00:00 2001 From: "Jorge Boncompte [DTI2]" Date: Thu, 9 Sep 2010 16:38:19 -0700 Subject: minix: fix regression in minix_mkdir() commit eee743fd7eac9f2ea69ad06d093dfb5a12538fe5 upstream. Commit 9eed1fb721c ("minix: replace inode uid,gid,mode init with helper") broke directory creation on minix filesystems. Fix it by passing the needed mode flag to inode init helper. Signed-off-by: Jorge Boncompte [DTI2] Cc: Dmitry Monakhov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/minix/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/minix/namei.c b/fs/minix/namei.c index e20ee85955d..f3f3578393a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, mode, &err); + inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; -- cgit v1.2.3 From a61df19f76ae062b5f1a99b46ad4431c35790104 Mon Sep 17 00:00:00 2001 From: Jan Sembera Date: Thu, 9 Sep 2010 16:37:54 -0700 Subject: binfmt_misc: fix binfmt_misc priority commit ee3aebdd8f5f8eac41c25c80ceee3d728f920f3b upstream. Commit 74641f584da ("alpha: binfmt_aout fix") (May 2009) introduced a regression - binfmt_misc is now consulted after binfmt_elf, which will unfortunately break ia32el. ia32 ELF binaries on ia64 used to be matched using binfmt_misc and executed using wrapper. As 32bit binaries are now matched by binfmt_elf before bindmt_misc kicks in, the wrapper is ignored. The fix increases precedence of binfmt_misc to the original state. Signed-off-by: Jan Sembera Cc: Ivan Kokshaysky Cc: Al Viro Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c4e83537ead..42b60b04ea0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -723,7 +723,7 @@ static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) { - err = register_binfmt(&misc_format); + err = insert_binfmt(&misc_format); if (err) unregister_filesystem(&bm_fs_type); } -- cgit v1.2.3 From eb6719b3e83ba65f595f18d0fe62923a0d7db734 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 14 Sep 2010 11:38:24 -0400 Subject: cifs: fix potential double put of TCP session reference commit 460cf3411b858ad509d5255e0dfaf862a83c0299 upstream. cifs_get_smb_ses must be called on a server pointer on which it holds an active reference. It first does a search for an existing SMB session. If it finds one, it'll put the server reference and then try to ensure that the negprot is done, etc. If it encounters an error at that point then it'll return an error. There's a potential problem here though. When cifs_get_smb_ses returns an error, the caller will also put the TCP server reference leading to a double-put. Fix this by having cifs_get_smb_ses only put the server reference if it found an existing session that it could use and isn't returning an error. Reviewed-by: Suresh Jayaraman Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2208f06e4c4..fa1039a7ac4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1647,9 +1647,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); - /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (rc) { @@ -1672,6 +1669,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) } } mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); FreeXid(xid); return ses; } -- cgit v1.2.3 From 8b9d67d73f195b7e2ed83cef14b0b86c5d73ddce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: NFS: Fix a typo in nfs_sockaddr_match_ipaddr6 commit b20d37ca9561711c6a3c4b859c2855f49565e061 upstream. Reported-by: Ben Greear Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25b5257b7a..e0067708a55 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -274,7 +274,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3 From 90f12412d937326e8e86c0756e3044ec551fae5a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Sep 2010 11:49:01 +0200 Subject: char: Mark /dev/zero and /dev/kmem as not capable of writeback commit 371d217ee1ff8b418b8f73fb2a34990f951ec2d4 upstream. These devices don't do any writeback but their device inodes still can get dirty so mark bdi appropriately so that bdi code does the right thing and files inodes to lists of bdi carrying the device inodes. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/char_dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index f80a4f25123..143d393881c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = { #endif /* permit direct mmap, for read, write or exec */ BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | + /* no writeback happens */ + BDI_CAP_NO_ACCT_AND_WRITEBACK), }; static struct kobj_map *cdev_map; -- cgit v1.2.3 From d678de28a43d985e3601dc2f4b62f6e7506a61cf Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 22 Sep 2010 14:32:56 -0400 Subject: Prevent freeing uninitialized pointer in compat_do_readv_writev commit 767b68e96993e29e3480d7ecdd9c4b84667c5762 upstream. In 32-bit compatibility mode, the error handling for compat_do_readv_writev() may free an uninitialized pointer, potentially leading to all sorts of ugly memory corruption. This is reliably triggerable by unprivileged users by invoking the readv()/writev() syscalls with an invalid iovec pointer. The below patch fixes this to emulate the non-compat version. Introduced by commit b83733639a49 ("compat: factor out compat_rw_copy_check_uvector from compat_do_readv_writev") Signed-off-by: Dan Rosenberg Cc: Al Viro Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 6490d2134ff..af7c2301a2e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1150,7 +1150,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; + struct iovec *iov = iovstack; ssize_t ret; io_fn_t fn; iov_fn_t fnv; -- cgit v1.2.3 From 3285da4a3cd1c15c4627eb134149e279ee6854e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Sep 2010 13:04:54 -0700 Subject: /proc/vmcore: fix seeking commit c227e69028473c7c7994a9b0a2cc0034f3f7e0fe upstream. Commit 73296bc611 ("procfs: Use generic_file_llseek in /proc/vmcore") broke seeking on /proc/vmcore. This changes it back to use default_llseek in order to restore the original behaviour. The problem with generic_file_llseek is that it only allows seeks up to inode->i_sb->s_maxbytes, which is zero on procfs and some other virtual file systems. We should merge generic_file_llseek and default_llseek some day and clean this up in a proper way, but for 2.6.35/36, reverting vmcore is the safer solution. Signed-off-by: Arnd Bergmann Cc: Frederic Weisbecker Reported-by: CAI Qian Tested-by: CAI Qian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91c817ff02c..2367fb3f70b 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; static struct vmcore* __init get_new_element(void) -- cgit v1.2.3 From 9b1bb7cf6c57936bd35bb40c381e1f587fc2dd67 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Sep 2010 13:05:03 -0700 Subject: aio: do not return ERESTARTSYS as a result of AIO commit a0c42bac79731276c9b2f28d54f9e658fcf843a2 upstream. OCFS2 can return ERESTARTSYS from its write function when the process is signalled while waiting for a cluster lock (and the filesystem is mounted with intr mount option). Generally, it seems reasonable to allow filesystems to return this error code from its IO functions. As we must not leak ERESTARTSYS (and similar error codes) to userspace as a result of an AIO operation, we have to properly convert it to EINTR inside AIO code (restarting the syscall isn't really an option because other AIO could have been already submitted by the same io_submit syscall). Signed-off-by: Jan Kara Reviewed-by: Jeff Moyer Cc: Christoph Hellwig Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 1ccf25cef1f..25c815dc4b9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) */ ret = retry(iocb); - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; aio_complete(iocb, ret, 0); + } out: spin_lock_irq(&ctx->ctx_lock); -- cgit v1.2.3 From 1c00b83d13a4b3ed5eb8a696823ce2432c96b5f3 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 10 Sep 2010 14:16:00 -0700 Subject: aio: check for multiplication overflow in do_io_submit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 75e1c70fc31490ef8a373ea2a4bea2524099b478 upstream. Tavis Ormandy pointed out that do_io_submit does not do proper bounds checking on the passed-in iocb array:        if (unlikely(nr < 0))                return -EINVAL;        if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(iocbpp)))))                return -EFAULT;                      ^^^^^^^^^^^^^^^^^^ The attached patch checks for overflow, and if it is detected, the number of iocbs submitted is scaled down to a number that will fit in the long.  This is an ok thing to do, as sys_io_submit is documented as returning the number of iocbs submitted, so callers should handle a return value of less than the 'nr' argument passed in. Reported-by: Tavis Ormandy Signed-off-by: Jeff Moyer Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 25c815dc4b9..5fb0fd7c80a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1667,6 +1667,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; + if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) + nr = LONG_MAX/sizeof(*iocbpp); + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) return -EFAULT; -- cgit v1.2.3 From 4974d35a6988b859c3a9a6e3eb98c86141ce2076 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 9 Sep 2010 14:45:00 +0100 Subject: GFS2: gfs2_logd should be using interruptible waits commit 5f4874903df3562b9d5649fc1cf7b8c6bb238e42 upstream. Looks like this crept in, in a recent update. Reported-by: Krzysztof Urbaniak Signed-off-by: Steven Whitehouse Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6a857e24f94..83917b50a19 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -932,7 +932,7 @@ int gfs2_logd(void *data) do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, - TASK_UNINTERRUPTIBLE); + TASK_INTERRUPTIBLE); if (!gfs2_ail_flush_reqd(sdp) && !gfs2_jrnl_flush_reqd(sdp) && !kthread_should_stop()) -- cgit v1.2.3 From cbf012878f93079e232075929bb46c91f704b98e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 28 Jul 2010 10:18:37 -0400 Subject: inotify: send IN_UNMOUNT events commit 611da04f7a31b2208e838be55a42c7a1310ae321 upstream. Since the .31 or so notify rewrite inotify has not sent events about inodes which are unmounted. This patch restores those events. Signed-off-by: Eric Paris Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/notify/inotify/inotify_user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e46ca685b9b..0c6bbc031f1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -96,8 +96,11 @@ static inline __u32 inotify_arg_to_mask(u32 arg) { __u32 mask; - /* everything should accept their own ignored and cares about children */ - mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); + /* + * everything should accept their own ignored, cares about children, + * and should receive events when the inode is unmounted + */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); /* mask off the flags used to open the fd */ mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); -- cgit v1.2.3 From 872208d77525c5f0886c0d68016f322048db56f8 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Mon, 6 Sep 2010 18:24:57 -0400 Subject: xfs: prevent reading uninitialized stack memory commit a122eb2fdfd78b58c6dd992d6f4b1aaef667eef9 upstream. The XFS_IOC_FSGETXATTR ioctl allows unprivileged users to read 12 bytes of uninitialized stack memory, because the fsxattr struct declared on the stack in xfs_ioc_fsgetxattr() does not alter (or zero) the 12-byte fsx_pad member before copying it back to the user. This patch takes care of it. Signed-off-by: Dan Rosenberg Reviewed-by: Eric Sandeen Signed-off-by: Alex Elder Cc: dann frazier Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index e59a8106283..82a74f6142f 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -794,6 +794,8 @@ xfs_ioc_fsgetxattr( { struct fsxattr fa; + memset(&fa, 0, sizeof(struct fsxattr)); + xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; -- cgit v1.2.3 From 712f99d8fcd92a2b4a0949be7490d0092d1862ac Mon Sep 17 00:00:00 2001 From: Mian Yousaf Kaukab Date: Thu, 6 May 2010 09:10:53 +0200 Subject: fix cache coherence issues with bounce buffers [Rabin VINCENT] This is for I$-D$ coherence issues when bounce buffers are used for the MMC driver and code is executed from a file system on eMMC. IMO a better fix is to either change flush_kernel_dcache_page() to flush the D$ even on VIPT non-aliasing caches or to replace the flush_kernel_dcache_page() in lib/scatterlist.c with flush_dcache_page(). Signed-off-by: Mian Yousaf Kaukab --- fs/mpage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index fd56ca2ea55..ea5b3e685d5 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -52,6 +52,8 @@ static void mpage_end_io_read(struct bio *bio, int err) prefetchw(&bvec->bv_page->flags); if (uptodate) { + /* FIXME: fix to solve cache coherence issues. */ + flush_dcache_page(page); SetPageUptodate(page); } else { ClearPageUptodate(page); -- cgit v1.2.3 From e29fe08145969c9f5bf2023f5a23a752e6ba6c2f Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 12 May 2010 11:55:51 +0200 Subject: Added commandline partitions for block devices Reviewed-on: http://gerrit.lud.stericsson.com/gerrit/305 Reviewed-by: Jonas ABERG Reviewed-by: Linus WALLEIJ Tested-by: Linus WALLEIJ Signed-off-by: Mian Yousaf Kaukab Change-Id: I3725676a9177b0072a435865ae578a6e17bd7168 Reviewed-on: http://gerrit.lud.stericsson.com/gerrit/2206 --- fs/partitions/Kconfig | 19 +++++++ fs/partitions/Makefile | 1 + fs/partitions/blkdev_parts.c | 127 +++++++++++++++++++++++++++++++++++++++++++ fs/partitions/blkdev_parts.h | 14 +++++ fs/partitions/check.c | 4 ++ 5 files changed, 165 insertions(+) create mode 100755 fs/partitions/blkdev_parts.c create mode 100755 fs/partitions/blkdev_parts.h (limited to 'fs') diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index cb5f0a3f1b0..097be1934ee 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -68,6 +68,25 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config BLKDEV_PARTITION + bool "Blockdev commandline partition support" if PARTITION_ADVANCED + default n + help + Say Y if you like to setup partitions for block devices by reading + from the kernel command line (kernel boot arguments). + + The format of the partitions on the command line: + blkdevparts=[;] + := :[,] + := [@] + + := unique id used to map driver to blockdev name + := size in numbers of sectors + := offset in sectors for partition to start at + + Example: + blkdevparts=mmc0:1024@0,524288@1024;mmc1:8192@0,8192@8192 + config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile index 03af8eac51d..48b216c53db 100644 --- a/fs/partitions/Makefile +++ b/fs/partitions/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_BLKDEV_PARTITION) += blkdev_parts.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/fs/partitions/blkdev_parts.c b/fs/partitions/blkdev_parts.c new file mode 100755 index 00000000000..48f4136d720 --- /dev/null +++ b/fs/partitions/blkdev_parts.c @@ -0,0 +1,127 @@ +/* + * + * Copyright (C) ST-Ericsson SA 2010 + * + * Author: Ulf Hansson for ST-Ericsson + * License terms: GNU General Public License (GPL) version 2 + * + * Create partitions for block devices by reading from the kernel + * command line (kernel boot arguments). + * + */ + +#include "check.h" +#include "blkdev_parts.h" + +static char *cmdline; + +/* + * This is the handler for our kernel commandline parameter, + * called from main.c::checksetup(). + * Note that we can not yet kmalloc() anything, so we only save + * the commandline for later processing. + */ +static int cmdline_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_setup); + +/* Parse for a matching blkdev-id and return pointer to partdef */ +static char *parse_blkdev_id(char *blkdev_name) +{ + int blkdev_id_len; + char *p, *blkdev_id; + + /* Start parsing for a matching blkdev-id */ + p = blkdev_id = cmdline; + while (blkdev_id != NULL) { + + /* Find the end of the blkdev-id string */ + p = strchr(blkdev_id, ':'); + if (p == NULL) + return NULL; + + /* Check if we found a matching blkdev-id */ + blkdev_id_len = p - blkdev_id; + if (strlen(blkdev_name) == blkdev_id_len) { + if (strncmp(blkdev_name, blkdev_id, blkdev_id_len) == 0) + return p; + } + + /* Move to next blkdev-id string if there is one */ + blkdev_id = strchr(p, ';'); + if (blkdev_id != NULL) + blkdev_id++; + } + return NULL; +} + +static int parse_partdef(char **part, struct parsed_partitions *state, int part_nbr) +{ + sector_t size, offset; + char *p = *part; + + /* Skip the beginning "," or ":" */ + p++; + + /* Fetch and verify size from partdef */ + size = simple_strtoull(p, &p, 10); + if ((size == 0) || (*p != '@')) + return 0; + + /* Skip the "@" */ + p++; + + /* Fetch offset from partdef and check if there are more parts */ + offset = simple_strtoull(p, &p, 10); + if (*p == ',') + *part = p; + else + *part = NULL; + + /* Add partition to state */ + put_partition(state, part_nbr, offset, size); + printk(KERN_INFO "\nPartition: size=%llu, offset=%llu\n", + (unsigned long long) size, + (unsigned long long) offset); + return 1; +} + +static int parse_blkdev_parts(char *blkdev_name, struct parsed_partitions *state) +{ + char *partdef; + int part_nbr = 0; + + /* Find partdef */ + partdef = parse_blkdev_id(blkdev_name); + + /* Add parts */ + while (partdef != NULL) { + /* Find next part and add it to state */ + part_nbr++; + if (!parse_partdef(&partdef, state, part_nbr)) + return 0; + } + return part_nbr; +} + +int blkdev_partition(struct parsed_partitions *state, struct block_device *bdev) +{ + char blkdev_name[BDEVNAME_SIZE]; + + /* Check if there are any partitions to handle */ + if (cmdline == NULL) + return 0; + + /* Get the name of the blockdevice we are operating upon */ + if (bdevname(bdev, blkdev_name) == NULL) { + printk(KERN_WARNING "Could not get a blkdev name\n"); + return 0; + } + + /* Parse for partitions and add them to the state */ + return parse_blkdev_parts(blkdev_name, state); +} + diff --git a/fs/partitions/blkdev_parts.h b/fs/partitions/blkdev_parts.h new file mode 100755 index 00000000000..e6c5d40ed76 --- /dev/null +++ b/fs/partitions/blkdev_parts.h @@ -0,0 +1,14 @@ +/* + * + * Copyright (C) ST-Ericsson SA 2010 + * + * Author: Ulf Hansson for ST-Ericsson + * License terms: GNU General Public License (GPL) version 2 + * + * Create partitions for block devices by reading from the kernel + * command line (kernel boot arguments). + * + */ + +int blkdev_partition(struct parsed_partitions *state, struct block_device *bdev); + diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 5dcd4b0c553..07b273bc1d7 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -27,6 +27,7 @@ #include "acorn.h" #include "amiga.h" #include "atari.h" +#include "blkdev_parts.h" #include "ldm.h" #include "mac.h" #include "msdos.h" @@ -50,6 +51,9 @@ static int (*check_part[])(struct parsed_partitions *) = { * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. */ +#ifdef CONFIG_BLKDEV_PARTITION + blkdev_partition, +#endif #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif -- cgit v1.2.3