From 921a1650de9eed40dd64d681aba4a4d98856f289 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 01:15:31 +0400 Subject: new helper: done_path_create() releases what needs to be released after {kern,user}_path_create() Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index d2ef8b34b967..4bf19d8174ed 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, int); extern struct dentry *user_path_create(int, const char __user *, struct path *, int); +extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); -- cgit v1.2.3 From e4fad8e5d220e3dfb1050eee752ee5058f29a232 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Jul 2012 15:33:25 +0400 Subject: consolidate pipe file creation Signed-off-by: Al Viro --- include/linux/fs.h | 3 --- include/linux/pipe_fs_i.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8fabb037a48d..478237844648 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2326,9 +2326,6 @@ static inline void i_readcount_inc(struct inode *inode) } #endif extern int do_pipe_flags(int *, int); -extern struct file *create_read_pipe(struct file *f, int flags); -extern struct file *create_write_pipe(int flags); -extern void free_write_pipe(struct file *); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e1ac1ce16fb0..e16dcb31f0c7 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -162,4 +162,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); +int create_pipe_files(struct file **, int); + #endif -- cgit v1.2.3 From 800179c9b8a1e796e441674776d11cd4c05d61d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jul 2012 17:29:07 -0700 Subject: fs: add link restrictions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds symlink and hardlink restrictions to the Linux VFS. Symlinks: A long-standing class of security issues is the symlink-based time-of-check-time-of-use race, most commonly seen in world-writable directories like /tmp. The common method of exploitation of this flaw is to cross privilege boundaries when following a given symlink (i.e. a root process follows a symlink belonging to another user). For a likely incomplete list of hundreds of examples across the years, please see: http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp The solution is to permit symlinks to only be followed when outside a sticky world-writable directory, or when the uid of the symlink and follower match, or when the directory owner matches the symlink's owner. Some pointers to the history of earlier discussion that I could find: 1996 Aug, Zygo Blaxell http://marc.info/?l=bugtraq&m=87602167419830&w=2 1996 Oct, Andrew Tridgell http://lkml.indiana.edu/hypermail/linux/kernel/9610.2/0086.html 1997 Dec, Albert D Cahalan http://lkml.org/lkml/1997/12/16/4 2005 Feb, Lorenzo Hernández García-Hierro http://lkml.indiana.edu/hypermail/linux/kernel/0502.0/1896.html 2010 May, Kees Cook https://lkml.org/lkml/2010/5/30/144 Past objections and rebuttals could be summarized as: - Violates POSIX. - POSIX didn't consider this situation and it's not useful to follow a broken specification at the cost of security. - Might break unknown applications that use this feature. - Applications that break because of the change are easy to spot and fix. Applications that are vulnerable to symlink ToCToU by not having the change aren't. Additionally, no applications have yet been found that rely on this behavior. - Applications should just use mkstemp() or O_CREATE|O_EXCL. - True, but applications are not perfect, and new software is written all the time that makes these mistakes; blocking this flaw at the kernel is a single solution to the entire class of vulnerability. - This should live in the core VFS. - This should live in an LSM. (https://lkml.org/lkml/2010/5/31/135) - This should live in an LSM. - This should live in the core VFS. (https://lkml.org/lkml/2010/8/2/188) Hardlinks: On systems that have user-writable directories on the same partition as system files, a long-standing class of security issues is the hardlink-based time-of-check-time-of-use race, most commonly seen in world-writable directories like /tmp. The common method of exploitation of this flaw is to cross privilege boundaries when following a given hardlink (i.e. a root process follows a hardlink created by another user). Additionally, an issue exists where users can "pin" a potentially vulnerable setuid/setgid file so that an administrator will not actually upgrade a system fully. The solution is to permit hardlinks to only be created when the user is already the existing file's owner, or if they already have read/write access to the existing file. Many Linux users are surprised when they learn they can link to files they have no access to, so this change appears to follow the doctrine of "least surprise". Additionally, this change does not violate POSIX, which states "the implementation may require that the calling process has permission to access the existing file"[1]. This change is known to break some implementations of the "at" daemon, though the version used by Fedora and Ubuntu has been fixed[2] for a while. Otherwise, the change has been undisruptive while in use in Ubuntu for the last 1.5 years. [1] http://pubs.opengroup.org/onlinepubs/9699919799/functions/linkat.html [2] http://anonscm.debian.org/gitweb/?p=collab-maint/at.git;a=commitdiff;h=f4114656c3a6c6f6070e315ffdf940a49eda3279 This patch is based on the patches in Openwall and grsecurity, along with suggestions from Al Viro. I have added a sysctl to enable the protected behavior, and documentation. Signed-off-by: Kees Cook Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 478237844648..80c819cbe272 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -437,6 +437,8 @@ extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; +extern int sysctl_protected_symlinks; +extern int sysctl_protected_hardlinks; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, -- cgit v1.2.3 From a51d9eaa41866ab6b4b6ecad7b621f8b66ece0dc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jul 2012 17:29:08 -0700 Subject: fs: add link restriction audit reporting Adds audit messages for unexpected link restriction violations so that system owners will have some sort of potentially actionable information about misbehaving processes. Signed-off-by: Kees Cook Signed-off-by: Al Viro --- include/linux/audit.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 22f292a917a3..36abf2aa7e68 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -130,6 +130,7 @@ #define AUDIT_LAST_KERN_ANOM_MSG 1799 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ #define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */ +#define AUDIT_ANOM_LINK 1702 /* Suspicious use of file links */ #define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */ #define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */ #define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ @@ -687,6 +688,8 @@ extern void audit_log_d_path(struct audit_buffer *ab, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); +extern void audit_log_link_denied(const char *operation, + struct path *link); extern void audit_log_lost(const char *message); #ifdef CONFIG_SECURITY extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); @@ -716,6 +719,7 @@ extern int audit_enabled; #define audit_log_untrustedstring(a,s) do { ; } while (0) #define audit_log_d_path(b, p, d) do { ; } while (0) #define audit_log_key(b, k) do { ; } while (0) +#define audit_log_link_denied(o, l) do { ; } while (0) #define audit_log_secctx(b,s) do { ; } while (0) #define audit_enabled 0 #endif -- cgit v1.2.3 From 4fcf1c6205fcfc7a226a144ae4d83b7f5415cab8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:29 +0200 Subject: mm: Make default vm_ops provide ->page_mkwrite handler Make default vm_ops provide ->page_mkwrite handler. Currently it only updates file's modification times and gets locked page but later it will also handle filesystem freezing. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b36d08ce5c57..15987d8e1d59 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1411,6 +1411,7 @@ extern void truncate_inode_pages_range(struct address_space *, /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); -- cgit v1.2.3 From 4a55c1017b8dcfd0554734ce3f19374d5b522d59 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:33 +0200 Subject: nfsd: Push mnt_want_write() outside of i_mutex When mnt_want_write() starts to handle freezing it will get a full lock semantics requiring proper lock ordering. So push mnt_want_write() call consistently outside of i_mutex. CC: linux-nfs@vger.kernel.org CC: "J. Bruce Fields" Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/nfsd/nfsfh.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index ce4743a26015..fa63048fecff 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -143,6 +143,7 @@ typedef struct svc_fh { int fh_maxsize; /* max size for fh_handle */ unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ #ifdef CONFIG_NFSD_V3 unsigned char fh_post_saved; /* post-op attrs saved */ -- cgit v1.2.3 From 5accdf82ba25cacefd6c1867f1704beb4d244cdd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:34 +0200 Subject: fs: Improve filesystem freezing handling vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the file system is already frozen. This creates problems with flusher thread hanging on frozen filesystem. Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again or we see s_frozen set and bail out of page fault. This works to protect from page being marked writeable while filesystem freezing is running but has an unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on frozen filesystem resulting in similar problems with flusher thread as the first problem. This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores because of lock ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one of internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem takes the writer side. Only that we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter and the writer side is implemented using s_writers.frozen superblock field. [AV: microoptimize sb_start_write(); we want it fast in normal case] BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 80c819cbe272..aefed9426b03 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -412,6 +412,7 @@ struct inodes_stat_t { #include #include #include +#include #include @@ -1439,6 +1440,8 @@ extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); +struct mm_struct; + /* * Umount options */ @@ -1452,6 +1455,32 @@ extern int send_sigurg(struct fown_struct *fown); extern struct list_head super_blocks; extern spinlock_t sb_lock; +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_TRANS = 2, + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop + * internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + /* Counters for counting writers at each level */ + struct percpu_counter counter[SB_FREEZE_LEVELS]; + wait_queue_head_t wait; /* queue for waiting for + writers / faults to finish */ + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* queue for waiting for + sb to be thawed */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map lock_map[SB_FREEZE_LEVELS]; +#endif +}; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1501,6 +1530,7 @@ struct super_block { int s_frozen; wait_queue_head_t s_wait_unfrozen; + struct sb_writers s_writers; char s_id[32]; /* Informational name */ u8 s_uuid[16]; /* UUID */ @@ -1555,14 +1585,119 @@ extern struct timespec current_fs_time(struct super_block *sb); /* * Snapshotting support. */ -enum { - SB_UNFROZEN = 0, - SB_FREEZE_WRITE = 1, - SB_FREEZE_TRANS = 2, -}; +/* Will go away when all users are converted */ +#define vfs_check_frozen(sb, level) do { } while (0) + +void __sb_end_write(struct super_block *sb, int level); +int __sb_start_write(struct super_block *sb, int level, bool wait); + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_mutex (write path, truncate, directory ops, ...) + * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE, true); +} + +static inline int sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write(sb, SB_FREEZE_WRITE, false); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_sem + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); +} + +/* + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). + */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS, true); +} -#define vfs_check_frozen(sb, level) \ - wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) extern bool inode_owner_or_capable(const struct inode *inode); @@ -1886,6 +2021,7 @@ struct file_system_type { struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; + struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; -- cgit v1.2.3 From d9c95bdd53a8d9116d269c91ce3d151472e6bcd6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:47 +0200 Subject: fs: Remove old freezing mechanism Now that all users are converted, we can remove functions, variables, and constants defined by the old freezing mechanism. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index aefed9426b03..0f4b79be8717 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1459,7 +1459,6 @@ extern spinlock_t sb_lock; enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ - SB_FREEZE_TRANS = 2, SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ @@ -1528,8 +1527,6 @@ struct super_block { struct hlist_node s_instances; struct quota_info s_dquot; /* Diskquota specific options */ - int s_frozen; - wait_queue_head_t s_wait_unfrozen; struct sb_writers s_writers; char s_id[32]; /* Informational name */ @@ -1585,8 +1582,6 @@ extern struct timespec current_fs_time(struct super_block *sb); /* * Snapshotting support. */ -/* Will go away when all users are converted */ -#define vfs_check_frozen(sb, level) do { } while (0) void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); -- cgit v1.2.3