UBUNTU: SAUCE: x86: implement cs-limit nx-emulation for ia32

OriginalAuthor: Kyle McMartin <kyle@redhat.com>, Dave Jones <djones@redhat.com>, Solar Designer <solar at openwall.com> OriginalLocation: http://cvs.fedoraproject.org/viewvc/devel/kernel/linux-2.6-execshield.patch?view=log Bug: #369978 This is a refresh from version 1.117 as carried by the Fedora Project. Implements NX emulation via CS-limits. It closes a gap in security protections on ia32 kernels without PAE, and for ia32 hardware that lacks the NX feature. Upstream feels this NX emulation is not appropriate for mainline, and as such, RedHat and others have carried it in their kernels for a long time now. Also reference https://blueprints.edge.launchpad.net/ubuntu/+spec/use-pae-when-possible Signed-off-by: Kees Cook <kees.cook@canonical.com> Signed-off-by: Leann Ogasawara <leann.ogasawara@canonical.com>
author: Kees Cook <kees.cook@canonical.com> 2010-05-25 09:51:25 -0700
committer: Leann Ogasawara <leann.ogasawara@canonical.com> 2010-07-23 11:28:57 +0200
commit: 28e2e2d40e35aa03ebcc0e514a58b7bfd5f5cab5 (patch)
tree: c9f669c235db3ba955cf83d7beea316acc86e175
parent: dac8f50d6a1e4f1a13ef93c9a40ebf5be612bcfc (diff)
23 files changed, 463 insertions, 27 deletions
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b307..526248dd31e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -5,6 +5,7 @@
 #include <asm/ldt.h>
 #include <asm/mmu.h>
 #include <linux/smp.h>
+#include <linux/mm_types.h>
 
 static inline void fill_ldt(struct desc_struct *desc,
 			    const struct user_desc *info)
@@ -93,6 +94,9 @@ static inline int desc_empty(const void *ptr)
 
 #define load_TLS(t, cpu) native_load_tls(t, cpu)
 #define set_ldt native_set_ldt
+#ifdef CONFIG_X86_32
+#define load_user_cs_desc native_load_user_cs_desc
+#endif /*CONFIG_X86_32*/
 
 #define write_ldt_entry(dt, entry, desc)	\
 	native_write_ldt_entry(dt, entry, desc)
@@ -392,4 +396,25 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
 	_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
 }
 
+#ifdef CONFIG_X86_32
+static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
+{
+	limit = (limit - 1) / PAGE_SIZE;
+	desc->a = limit & 0xffff;
+	desc->b = (limit & 0xf0000) | 0x00c0fb00;
+}
+
+static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm)
+{
+	get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs;
+}
+
+#define arch_add_exec_range arch_add_exec_range
+#define arch_remove_exec_range arch_remove_exec_range
+#define arch_flush_exec_range arch_flush_exec_range
+extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_flush_exec_range(struct mm_struct *mm);
+#endif /* CONFIG_X86_32 */
+
 #endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea..8314c66c7c3 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -7,12 +7,19 @@
 /*
  * The x86 doesn't have a mmu context, but
  * we put the segment information here.
+ *
+ * exec_limit is used to track the range PROT_EXEC
+ * mappings span.
  */
 typedef struct {
 	void *ldt;
 	int size;
 	struct mutex lock;
 	void *vdso;
+#ifdef CONFIG_X86_32
+	struct desc_struct user_cs;
+	unsigned long exec_limit;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e..55dadb2a61b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -289,6 +289,12 @@ static inline void set_ldt(const void *addr, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
 }
+#ifdef CONFIG_X86_32
+static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm)
+{
+	PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm);
+}
+#endif /*CONFIG_X86_32*/
 static inline void store_gdt(struct desc_ptr *dtr)
 {
 	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef553234..19c2793fb3c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -118,6 +118,9 @@ struct pv_cpu_ops {
 	void (*store_gdt)(struct desc_ptr *);
 	void (*store_idt)(struct desc_ptr *);
 	void (*set_ldt)(const void *desc, unsigned entries);
+#ifdef CONFIG_X86_32
+	void (*load_user_cs_desc)(int cpu, struct mm_struct *mm);
+#endif
 	unsigned long (*store_tr)(void);
 	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8e..9de0cba46d9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -158,6 +158,9 @@ static inline int hlt_works(int cpu)
 
 #define cache_line_size()	(boot_cpu_data.x86_cache_alignment)
 
+#define __HAVE_ARCH_ALIGN_STACK
+extern unsigned long arch_align_stack(unsigned long sp);
+
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
 extern struct pt_regs *idle_regs(struct pt_regs *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211..40f5bb7e8a4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -802,6 +802,20 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	/* Filter out anything that depends on CPUID levels we don't have */
 	filter_cpuid_features(c, true);
 
+	/*
+	 *  emulation of NX with segment limits unfortunately means
+	 *  we have to disable the fast system calls, due to the way that
+	 *  sysexit clears the segment limits on return.
+	 *  If we have either disabled exec-shield on the boot command line,
+	 *  or we have NX, then we don't need to do this.
+	 */
+	if (exec_shield != 0) {
+#ifdef CONFIG_X86_PAE
+		if (!test_cpu_cap(c, X86_FEATURE_NX))
+#endif
+			clear_cpu_cap(c, X86_FEATURE_SEP);
+	}
+
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
 		const char *p;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c0..238b97d2415 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -345,6 +345,9 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
+#ifdef CONFIG_X86_32
+	.load_user_cs_desc = native_load_user_cs_desc,
+#endif /*CONFIG_X86_32*/
 	.load_gdt = native_load_gdt,
 	.load_idt = native_load_idt,
 	.store_gdt = native_store_gdt,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..fde71dfe9cd 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -243,7 +243,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 void
 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
+	int cpu;
+
 	set_user_gs(regs, 0);
+
 	regs->fs		= 0;
 	set_fs(USER_DS);
 	regs->ds		= __USER_DS;
@@ -252,6 +255,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	regs->cs		= __USER_CS;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
+
+	cpu = get_cpu();
+	load_user_cs_desc(cpu, current->mm);
+	put_cpu();
+
 	/*
 	 * Free the old FP and other extended state
 	 */
@@ -311,6 +319,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (preload_fpu)
 		prefetch(next->fpu.state);
 
+	if (next_p->mm)
+		load_user_cs_desc(cpu, next_p->mm);
+
 	/*
 	 * Reload esp0.
 	 */
@@ -404,3 +415,40 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
+static void modify_cs(struct mm_struct *mm, unsigned long limit)
+{
+	mm->context.exec_limit = limit;
+	set_user_cs(&mm->context.user_cs, limit);
+	if (mm == current->mm) {
+		int cpu;
+
+		cpu = get_cpu();
+		load_user_cs_desc(cpu, mm);
+		put_cpu();
+	}
+}
+
+void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
+{
+	if (limit > mm->context.exec_limit)
+		modify_cs(mm, limit);
+}
+
+void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
+{
+	struct vm_area_struct *vma;
+	unsigned long limit = PAGE_SIZE;
+
+	if (old_end == mm->context.exec_limit) {
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+				limit = vma->vm_end;
+		modify_cs(mm, limit);
+	}
+}
+
+void arch_flush_exec_range(struct mm_struct *mm)
+{
+	mm->context.exec_limit = 0;
+	set_user_cs(&mm->context.user_cs, 0);
+}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd..43f1281d440 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -109,6 +109,78 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
+#ifdef CONFIG_X86_32
+static inline int
+__compare_user_cs_desc(const struct desc_struct *desc1,
+	const struct desc_struct *desc2)
+{
+	return ((desc1->limit0 != desc2->limit0) ||
+		(desc1->limit != desc2->limit) ||
+		(desc1->base0 != desc2->base0) ||
+		(desc1->base1 != desc2->base1) ||
+		(desc1->base2 != desc2->base2));
+}
+
+/*
+ * lazy-check for CS validity on exec-shield binaries:
+ *
+ * the original non-exec stack patch was written by
+ * Solar Designer <solar at openwall.com>. Thanks!
+ */
+static int
+check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code)
+{
+	struct desc_struct *desc1, *desc2;
+	struct vm_area_struct *vma;
+	unsigned long limit;
+
+	if (current->mm == NULL)
+		return 0;
+
+	limit = -1UL;
+	if (current->mm->context.exec_limit != -1UL) {
+		limit = PAGE_SIZE;
+		spin_lock(&current->mm->page_table_lock);
+		for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+			if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+				limit = vma->vm_end;
+		vma = get_gate_vma(current);
+		if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+			limit = vma->vm_end;
+		spin_unlock(&current->mm->page_table_lock);
+		if (limit >= TASK_SIZE)
+			limit = -1UL;
+		current->mm->context.exec_limit = limit;
+	}
+	set_user_cs(&current->mm->context.user_cs, limit);
+
+	desc1 = &current->mm->context.user_cs;
+	desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS;
+
+	if (__compare_user_cs_desc(desc1, desc2)) {
+		/*
+		 * The CS was not in sync - reload it and retry the
+		 * instruction. If the instruction still faults then
+		 * we won't hit this branch next time around.
+		 */
+		if (print_fatal_signals >= 2) {
+			printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n",
+				error_code, error_code/8, regs->ip,
+				smp_processor_id());
+			printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n",
+				current->mm->context.exec_limit,
+				desc1->a, desc1->b, desc2->a, desc2->b);
+		}
+
+		load_user_cs_desc(cpu, current->mm);
+
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
@@ -265,6 +337,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	if (!user_mode(regs))
 		goto gp_in_kernel;
 
+#ifdef CONFIG_X86_32
+{
+	int cpu;
+	int ok;
+
+	cpu = get_cpu();
+	ok = check_lazy_exec_limit(cpu, regs, error_code);
+	put_cpu();
+
+	if (ok)
+		return;
+
+	if (print_fatal_signals) {
+		printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n",
+			error_code, error_code/8, regs->ip, smp_processor_id());
+		printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n",
+			current->mm->context.exec_limit,
+			current->mm->context.user_cs.a,
+			current->mm->context.user_cs.b);
+	}
+}
+#endif /*CONFIG_X86_32*/
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 13;
 
@@ -801,19 +896,37 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 }
 
 #ifdef CONFIG_X86_32
+/*
+ * The fixup code for errors in iret jumps to here (iret_exc). It loses
+ * the original trap number and erorr code. The bogus trap 32 and error
+ * code 0 are what the vanilla kernel delivers via:
+ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+ *
+ * NOTE: Because of the final "1" in the macro we need to enable interrupts.
+ *
+ * In case of a general protection fault in the iret instruction, we
+ * need to check for a lazy CS update for exec-shield.
+ */
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
-	siginfo_t info;
+	int ok;
+	int cpu;
+
 	local_irq_enable();
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_BADSTK;
-	info.si_addr = NULL;
-	if (notify_die(DIE_TRAP, "iret exception",
-			regs, error_code, 32, SIGILL) == NOTIFY_STOP)
-		return;
-	do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+	cpu = get_cpu();
+	ok = check_lazy_exec_limit(cpu, regs, error_code);
+	put_cpu();
+
+	if (!ok && notify_die(DIE_TRAP, "iret exception", regs,
+		error_code, 32, SIGSEGV) != NOTIFY_STOP) {
+			siginfo_t info;
+			info.si_signo = SIGSEGV;
+			info.si_errno = 0;
+			info.si_code = ILL_BADSTK;
+			info.si_addr = 0;
+			do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info);
+	}
 }
 #endif
 
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1dab5194fd9..360f39d585f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -124,13 +124,16 @@ static unsigned long mmap_legacy_base(void)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	if (mmap_is_legacy()) {
+	if (!(2 & exec_shield) && mmap_is_legacy()) {
 		mm->mmap_base = mmap_legacy_base();
 		mm->get_unmapped_area = arch_get_unmapped_area;
 		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		if (!(current->personality & READ_IMPLIES_EXEC)
+		    && mmap_is_ia32())
+			mm->get_unmapped_exec_area = arch_get_unmapped_exec_area;
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa3408..e0d9cce0338 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -1,3 +1,4 @@
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/init.h>
@@ -23,6 +24,7 @@ static int __init noexec_setup(char *str)
 		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
 		disable_nx = 1;
+		exec_shield = 0;
 	}
 	x86_configure_nx();
 	return 0;
@@ -40,6 +42,10 @@ void __cpuinit x86_configure_nx(void)
 void __init x86_report_nx(void)
 {
 	if (!cpu_has_nx) {
+		if (exec_shield)
+			printk(KERN_INFO "Using x86 segment limits to approximate NX protection\n");
+		else
+
 		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
 		       "missing in CPU or disabled in BIOS!\n");
 	} else {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d..e0286b18d49 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,6 +6,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 
+#include <asm/desc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/cache.h>
@@ -131,6 +132,12 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 	union smp_flush_state *f;
 
 	cpu = smp_processor_id();
+
+#ifdef CONFIG_X86_32
+	if (current->active_mm)
+		load_user_cs_desc(cpu, current->active_mm);
+#endif
+
 	/*
 	 * orig_rax contains the negated interrupt vector.
 	 * Use that to determine where the sender put the data.
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e9200..957bb679807 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (compat)
 		addr = VDSO_HIGH_BASE;
 	else {
-		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+		addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1);
 		if (IS_ERR_VALUE(addr)) {
 			ret = addr;
 			goto up_fail;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a..1ea06f842a9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -335,6 +335,24 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+#ifdef CONFIG_X86_32
+static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm)
+{
+	void *gdt;
+	xmaddr_t mgdt;
+	u64 descriptor;
+	struct desc_struct user_cs;
+
+	gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS];
+	mgdt = virt_to_machine(gdt);
+
+	user_cs = mm->context.user_cs;
+	descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32;
+
+	HYPERVISOR_update_descriptor(mgdt.maddr, descriptor);
+}
+#endif /*CONFIG_X86_32*/
+
 static void xen_load_gdt(const struct desc_ptr *dtr)
 {
 	unsigned long va = dtr->address;
@@ -961,6 +979,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.load_tr_desc = paravirt_nop,
 	.set_ldt = xen_set_ldt,
+#ifdef CONFIG_X86_32
+	.load_user_cs_desc = xen_load_user_cs_desc,
+#endif /*CONFIG_X86_32*/
 	.load_gdt = xen_load_gdt,
 	.load_idt = xen_load_idt,
 	.load_tls = xen_load_tls,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a..d114af6d44d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -74,7 +74,7 @@ static struct linux_binfmt elf_format = {
 		.hasvdso	= 1
 };
 
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+#define BAD_ADDR(x) IS_ERR_VALUE(x)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -701,6 +701,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			break;
 		}
 
+	if (current->personality == PER_LINUX && (exec_shield & 2)) {
+		executable_stack = EXSTACK_DISABLE_X;
+		current->flags |= PF_RANDOMIZE;
+	}
+
 	/* Some simple consistency checks for the interpreter */
 	if (elf_interpreter) {
 		retval = -ELIBBAD;
@@ -717,6 +722,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval)
 		goto out_free_dentry;
 
+#ifdef CONFIG_X86_32
+	/*
+	 * Turn off the CS limit completely if exec-shield disabled or
+	 * NX active:
+	 */
+	if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || (__supported_pte_mask & _PAGE_NX))
+		arch_add_exec_range(current->mm, -1);
+#endif
+
 	/* OK, This is the point of no return */
 	current->flags &= ~PF_FORKNOEXEC;
 	current->mm->def_flags = def_flags;
@@ -724,7 +738,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
 	   may depend on the personality.  */
 	SET_PERSONALITY(loc->elf_ex);
-	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
+	if (!(exec_shield & 2) &&
+			elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
@@ -890,7 +905,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 					    interpreter,
 					    &interp_map_addr,
 					    load_bias);
-		if (!IS_ERR((void *)elf_entry)) {
+		if (!BAD_ADDR(elf_entry)) {
 			/*
 			 * load_elf_interp() returns relocation
 			 * adjustment
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2b48041b91..09446e61bcf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1263,7 +1263,13 @@ extern int install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
 				   unsigned long flags, struct page **pages);
 
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int);
+
+static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0);
+}
 
 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b8bb9a6a1f3..f478e39e3cc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -227,6 +227,9 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
+       unsigned long (*get_unmapped_exec_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
diff --git a/include/linux/resource.h b/include/linux/resource.h
index f1e914eefea..d2aef9a9ebe 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -53,8 +53,11 @@ struct rlimit {
 /*
  * Limit the stack by to some sane default: root can always
  * increase this limit if needed..  8MB seems reasonable.
+ *
+ * (2MB more to cover randomization effects.)
  */
-#define _STK_LIM	(8*1024*1024)
+#define _STK_LIM	(10*1024*1024)
+#define EXEC_STACK_BIAS	(2*1024*1024)
 
 /*
  * GPG2 wants 64kB of mlocked memory, to make sure pass phrases
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 747fcaedddb..c8544513ea4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -101,6 +101,9 @@ struct bio_list;
 struct fs_struct;
 struct perf_event_context;
 
+extern int exec_shield;
+extern int print_fatal_signals;
+
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -394,6 +397,10 @@ extern void arch_pick_mmap_layout(struct mm_struct *mm);
 extern unsigned long
 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
 		       unsigned long, unsigned long);
+
+extern unsigned long
+arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
 extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9be395856eb..0537d2e3bd9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -103,6 +103,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
+
+int exec_shield = (1<<0);
+/* exec_shield is a bitmask:
+ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE
+ * (1<<0) 1: on [also on if !=0]
+ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK
+ * The old settings
+ * (1<<2) 4: vdso just below .text of main (unless too low)
+ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low)
+ * are ignored because the vdso is placed completely randomly
+ */
+
+static int __init setup_exec_shield(char *str)
+{
+	get_option(&str, &exec_shield);
+
+	return 1;
+}
+__setup("exec-shield=", setup_exec_shield);
+
 #ifdef CONFIG_BLOCK
 extern int blk_iopoll_enabled;
 #endif
@@ -430,6 +450,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "exec-shield",
+		.data		= &exec_shield,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f2788..9351762d175 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -44,6 +45,18 @@
 #define arch_rebalance_pgtables(addr, len)		(addr)
 #endif
 
+/* No sane architecture will #define these to anything else */
+#ifndef arch_add_exec_range
+#define arch_add_exec_range(mm, limit)	do { ; } while (0)
+#endif
+#ifndef arch_flush_exec_range
+#define arch_flush_exec_range(mm)	do { ; } while (0)
+#endif
+#ifndef arch_remove_exec_range
+#define arch_remove_exec_range(mm, limit)	do { ; } while (0)
+#endif
+
+
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
@@ -388,6 +401,8 @@ static inline void
 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent)
 {
+	if (vma->vm_flags & VM_EXEC)
+		arch_add_exec_range(mm, vma->vm_end);
 	if (prev) {
 		vma->vm_next = prev->vm_next;
 		prev->vm_next = vma;
@@ -489,6 +504,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
+	if (vma->vm_flags & VM_EXEC)
+		arch_remove_exec_range(mm, vma->vm_end);
 }
 
 /*
@@ -790,6 +807,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 		} else					/* cases 2, 5, 7 */
 			err = vma_adjust(prev, prev->vm_start,
 				end, prev->vm_pgoff, NULL);
+		if (prev->vm_flags & VM_EXEC)
+			arch_add_exec_range(mm, prev->vm_end);
 		if (err)
 			return NULL;
 		return prev;
@@ -982,7 +1001,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+		prot & PROT_EXEC);
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -1534,8 +1554,8 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 }
 
 unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, int exec)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
@@ -1548,7 +1568,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
+	if (exec && current->mm->get_unmapped_exec_area)
+		get_area = current->mm->get_unmapped_exec_area;
+	else
+		get_area = current->mm->get_unmapped_area;
+
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
@@ -1562,8 +1586,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 
 	return arch_rebalance_pgtables(addr, len);
 }
+EXPORT_SYMBOL(get_unmapped_area_prot);
+
+#define SHLIB_BASE	0x00110000
+
+unsigned long
+arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+		unsigned long len0, unsigned long pgoff, unsigned long flags)
+{
+	unsigned long addr = addr0, len = len0;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long tmp;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	if (!addr)
+		addr = randomize_range(SHLIB_BASE, 0x01000000, len);
+
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	addr = SHLIB_BASE;
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+
+		if (!vma || addr + len <= vma->vm_start) {
+			/*
+			 * Must not let a PROT_EXEC mapping get into the
+			 * brk area:
+			 */
+			if (addr + len > mm->brk)
+				goto failed;
+
+			/*
+			 * Up until the brk area we randomize addresses
+			 * as much as possible:
+			 */
+			if (addr >= 0x01000000) {
+				tmp = randomize_range(0x01000000,
+					PAGE_ALIGN(max(mm->start_brk,
+					(unsigned long)0x08000000)), len);
+				vma = find_vma(mm, tmp);
+				if (TASK_SIZE - len >= tmp &&
+				    (!vma || tmp + len <= vma->vm_start))
+					return tmp;
+			}
+			/*
+			 * Ok, randomization didnt work out - return
+			 * the result of the linear search:
+			 */
+			return addr;
+		}
+		addr = vma->vm_end;
+	}
+
+failed:
+	return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags);
+}
 
-EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
@@ -1638,6 +1730,16 @@ out:
 	return prev ? prev->vm_next : vma;
 }
 
+static int over_stack_limit(unsigned long sz)
+{
+	struct rlimit *rlim = current->signal->rlim;
+
+	if (sz < EXEC_STACK_BIAS)
+		return 0;
+	return (sz - EXEC_STACK_BIAS) >
+			ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur);
+}
+
 /*
  * Verify that the stack growth is acceptable and
  * update accounting. This is shared with both the
@@ -1654,7 +1756,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 		return -ENOMEM;
 
 	/* Stack limit test */
-	if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+	if (over_stack_limit(size))
 		return -ENOMEM;
 
 	/* mlock limit tests */
@@ -1966,10 +2068,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
-	if (new_below)
+	if (new_below) {
+		unsigned long old_end = vma->vm_end;
+
 		err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 			((addr - new->vm_start) >> PAGE_SHIFT), new);
-	else
+		if (vma->vm_flags & VM_EXEC)
+			arch_remove_exec_range(mm, old_end);
+	} else
 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
 	/* Success. */
@@ -2254,6 +2360,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	arch_flush_exec_range(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf885..4ea460ed538 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -25,9 +25,14 @@
 #include <linux/perf_event.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#ifndef arch_remove_exec_range
+#define arch_remove_exec_range(mm, limit)      do { ; } while (0)
+#endif
+
 #ifndef pgprot_modify
 static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 {
@@ -138,7 +143,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
-	unsigned long charged = 0;
+	unsigned long charged = 0, old_end = vma->vm_end;
 	pgoff_t pgoff;
 	int error;
 	int dirty_accountable = 0;
@@ -203,6 +208,9 @@ success:
 		dirty_accountable = 1;
 	}
 
+	if (oldflags & VM_EXEC)
+		arch_remove_exec_range(current->mm, old_end);
+
 	mmu_notifier_invalidate_range_start(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef..4c65678e7a6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -487,10 +487,10 @@ unsigned long do_mremap(unsigned long addr,
 		if (vma->vm_flags & VM_MAYSHARE)
 			map_flags |= MAP_SHARED;
 
-		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+		new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len,
 					vma->vm_pgoff +
 					((addr - vma->vm_start) >> PAGE_SHIFT),
-					map_flags);
+					map_flags, vma->vm_flags & VM_EXEC);
 		if (new_addr & ~PAGE_MASK) {
 			ret = new_addr;
 			goto out;
author	Kees Cook <kees.cook@canonical.com>	2010-05-25 09:51:25 -0700
committer	Leann Ogasawara <leann.ogasawara@canonical.com>	2010-07-23 11:28:57 +0200
commit	28e2e2d40e35aa03ebcc0e514a58b7bfd5f5cab5 (patch)
tree	c9f669c235db3ba955cf83d7beea316acc86e175
parent	dac8f50d6a1e4f1a13ef93c9a40ebf5be612bcfc (diff)