From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 24 Jun 2015 16:55:54 -0700 Subject: mm/slab_common: support the slub_debug boot option on specific object size The slub_debug=PU,kmalloc-xx cannot work because in the create_kmalloc_caches() the s->name is created after the create_kmalloc_cache() is called. The name is NULL in the create_kmalloc_cache() so the kmem_cache_flags() would not set the slub_debug flags to the s->flags. The fix here set up a kmalloc_names string array for the initialization purpose and delete the dynamic name creation of kmalloc_caches. [akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text] Signed-off-by: Gavin Guo Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 62 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 999bb3424d44..84e14582a14c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -783,6 +783,31 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[index]; } +/* + * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is + * kmalloc-67108864. + */ +static struct { + const char *name; + unsigned long size; +} const kmalloc_info[] __initconst = { + {NULL, 0}, {"kmalloc-96", 96}, + {"kmalloc-192", 192}, {"kmalloc-8", 8}, + {"kmalloc-16", 16}, {"kmalloc-32", 32}, + {"kmalloc-64", 64}, {"kmalloc-128", 128}, + {"kmalloc-256", 256}, {"kmalloc-512", 512}, + {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, + {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, + {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, + {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, + {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, + {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, + {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, + {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, + {"kmalloc-67108864", 67108864} +}; + /* * Create the kmalloc array. Some of the regular kmalloc arrays * may already have been created because they were needed to @@ -833,39 +858,30 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache(NULL, - 1 << i, flags); + kmalloc_caches[i] = create_kmalloc_cache( + kmalloc_info[i].name, + kmalloc_info[i].size, + flags); } /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches + * "i == 2" is the "kmalloc-192" case which is the last special + * case for initialization and it's the point to jump to + * allocate the minimize size of the object. In slab allocator, + * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 + * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is + * defined, it may be larger than 2^5 and here is also the + * trick to skip the empty gap. */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + if (i == 2) + i = (KMALLOC_SHIFT_LOW - 1); } /* Kmalloc array is now usable */ slab_state = UP; - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - char *n; - - if (s) { - n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); - - BUG_ON(!n); - s->name = n; - } - } - #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; -- cgit v1.2.3 From 34cc6990d4d2d85f60e583ebe3070f8c3ada465c Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Wed, 24 Jun 2015 16:55:57 -0700 Subject: slab: correct size_index table before replacing the bootstrap kmem_cache_node This patch moves the initialization of the size_index table slightly earlier so that the first few kmem_cache_node's can be safely allocated when KMALLOC_MIN_SIZE is large. There are currently two ways to generate indices into kmalloc_caches (via kmalloc_index() and via the size_index table in slab_common.c) and on some arches (possibly only MIPS) they potentially disagree with each other until create_kmalloc_caches() has been called. It seems that the intention is that the size_index table is a fast equivalent to kmalloc_index() and that create_kmalloc_caches() patches the table to return the correct value for the cases where kmalloc_index()'s if-statements apply. The failing sequence was: * kmalloc_caches contains NULL elements * kmem_cache_init initialises the element that 'struct kmem_cache_node' will be allocated to. For 32-bit Mips, this is a 56-byte struct and kmalloc_index returns KMALLOC_SHIFT_LOW (7). * init_list is called which calls kmalloc_node to allocate a 'struct kmem_cache_node'. * kmalloc_slab selects the kmem_caches element using size_index[size_index_elem(size)]. For MIPS, size is 56, and the expression returns 6. * This element of kmalloc_caches is NULL and allocation fails. * If it had not already failed, it would have called create_kmalloc_caches() at this point which would have changed size_index[size_index_elem(size)] to 7. I don't believe the bug to be LLVM specific but GCC doesn't normally encounter the problem. I haven't been able to identify exactly what GCC is doing better (probably inlining) but it seems that GCC is managing to optimize to the point that it eliminates the problematic allocations. This theory is supported by the fact that GCC can be made to fail in the same way by changing inline, __inline, __inline__, and __always_inline in include/linux/compiler-gcc.h such that they don't actually inline things. Signed-off-by: Daniel Sanders Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + mm/slab.h | 1 + mm/slab_common.c | 36 +++++++++++++++++++++--------------- mm/slub.c | 1 + 4 files changed, 24 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7eb38dd1cefa..200e22412a16 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void) kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); slab_early_init = 0; diff --git a/mm/slab.h b/mm/slab.h index 4c3ac12dd644..8da63e4e470f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(unsigned long); /* Find the kmalloc slab corresponding for a certain size */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 84e14582a14c..9f8d71f78404 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -809,25 +809,20 @@ static struct { }; /* - * Create the kmalloc array. Some of the regular kmalloc arrays - * may already have been created because they were needed to - * enable allocations for slab creation. + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN */ -void __init create_kmalloc_caches(unsigned long flags) +void __init setup_kmalloc_cache_index_table(void) { int i; - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); @@ -858,6 +853,17 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[i]) { kmalloc_caches[i] = create_kmalloc_cache( diff --git a/mm/slub.c b/mm/slub.c index 54c0876b43d5..816df0016555 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3700,6 +3700,7 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP -- cgit v1.2.3 From e0de78dfb4655752897d123a8cce6ecab4175dc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 24 Jun 2015 16:56:02 -0700 Subject: mm, hwpoison: add comment describing when to add new cases Here's another comment fix for hwpoison. It describes the "guiding principle" on when to add new memory error recovery code. Signed-off-by: Andi Kleen Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 501820c815b3..dfa9dd7f1aea 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -20,6 +20,14 @@ * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back -- cgit v1.2.3 From ebb09738d32b840be8157d556f7756e6dbcc1735 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 24 Jun 2015 16:56:05 -0700 Subject: mm, hwpoison: remove obsolete "Notebook" todo list All the items mentioned here have been either addressed, or were not really needed. So just remove the comment. Signed-off-by: Andi Kleen Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dfa9dd7f1aea..8e71b6e641ad 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -36,13 +36,6 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ #include #include #include -- cgit v1.2.3 From cd0924112119547b0d3fb13a9ed99717d924c2be Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 24 Jun 2015 16:56:07 -0700 Subject: thp: cleanup how khugepaged enters freezer khugepaged_do_scan() checks in every iteration whether freezing(current) is true, and in such case breaks out of the loop, which causes try_to_freeze() to be called immediately afterwards in khugepaged_wait_work(). If nothing else, this causes unnecessary freezing(current) test, and also makes the way khugepaged enters freezer a bit less obvious than necessary. Let's just try to freeze directly, instead of splitting it into two (directly adjacent) phases. Signed-off-by: Jiri Kosina Cc: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 078832cf3636..b3d8cd8d6968 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void) cond_resched(); - if (unlikely(kthread_should_stop() || freezing(current))) + if (unlikely(kthread_should_stop() || try_to_freeze())) break; spin_lock(&khugepaged_mm_lock); @@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void) static void khugepaged_wait_work(void) { - try_to_freeze(); - if (khugepaged_has_work()) { if (!khugepaged_scan_sleep_millisecs) return; -- cgit v1.2.3 From 36f881883c57941bb32d25cea6524f9612ab5a2c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jun 2015 16:56:10 -0700 Subject: mm: fix mprotect() behaviour on VM_LOCKED VMAs On mlock(2) we trigger COW on private writable VMA to avoid faults in future. mm/gup.c: 840 long populate_vma_page_range(struct vm_area_struct *vma, 841 unsigned long start, unsigned long end, int *nonblocking) 842 { ... 855 * We want to touch writable mappings with a write fault in order 856 * to break COW, except for shared mappings because these don't COW 857 * and we would not want to dirty them for nothing. 858 */ 859 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) 860 gup_flags |= FOLL_WRITE; But we miss this case when we make VM_LOCKED VMA writeable via mprotect(2). The test case: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #define PAGE_SIZE 4096 int main(int argc, char **argv) { struct rusage usage; long before; char *p; int fd; /* Create a file and populate first page of page cache */ fd = open("/tmp", O_TMPFILE | O_RDWR, S_IRUSR | S_IWUSR); write(fd, "1", 1); /* Create a *read-only* *private* mapping of the file */ p = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); /* * Since the mapping is read-only, mlock() will populate the mapping * with PTEs pointing to page cache without triggering COW. */ mlock(p, PAGE_SIZE); /* * Mapping became read-write, but it's still populated with PTEs * pointing to page cache. */ mprotect(p, PAGE_SIZE, PROT_READ | PROT_WRITE); getrusage(RUSAGE_SELF, &usage); before = usage.ru_minflt; /* Trigger COW: fault in mlock()ed VMA. */ *p = 1; getrusage(RUSAGE_SELF, &usage); printf("faults: %ld\n", usage.ru_minflt - before); return 0; } $ ./test faults: 1 Let's fix it by triggering populating of VMA in mprotect_fixup() on this condition. We don't care about population error as we don't in other similar cases i.e. mremap. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 88584838e704..e7d6f1171ecb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,8 @@ #include #include +#include "internal.h" + /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -322,6 +324,15 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. + */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); -- cgit v1.2.3 From e81f2d22370f8231cb7f13f454bcc8c0eb4e23f2 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 24 Jun 2015 16:56:13 -0700 Subject: mm/hugetlb: reduce arch dependent code about huge_pmd_unshare Currently we have many duplicates in definitions of huge_pmd_unshare. In all architectures this function just returns 0 when CONFIG_ARCH_WANT_HUGE_PMD_SHARE is N. This patch puts the default implementation in mm/hugetlb.c and lets these architectures use the common code. Signed-off-by: Zhang Zhen Cc: Russell King Cc: Catalin Marinas Cc: Tony Luck Cc: James Hogan Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Chris Metcalf Cc: David Rientjes Cc: James Yang Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 271e4432734c..716465ae57aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3789,6 +3789,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { return NULL; } + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -- cgit v1.2.3 From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Wed, 24 Jun 2015 16:56:19 -0700 Subject: mm: new arch_remap() hook Some architectures would like to be triggered when a memory area is moved through the mremap system call. This patch introduces a new arch_remap() mm hook which is placed in the path of mremap, and is called before the old area is unmapped (and the arch_unmap() hook is called). Signed-off-by: Laurent Dufour Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 034e2d360652..a7c93eceb1c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, old_addr, - moved_len, true); - return err; + } else { + if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, + old_addr, moved_len, true); + return err; + } } + arch_remap(mm, old_addr, old_addr + old_len, + new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ -- cgit v1.2.3 From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 24 Jun 2015 16:56:28 -0700 Subject: mm: only define hashdist variable when needed For !CONFIG_NUMA, hashdist will always be 0, since it's setter is otherwise compiled out. So we can save 4 bytes of data and some .text (although mostly in __init functions) by only defining it for CONFIG_NUMA. Signed-off-by: Rasmus Villemoes Acked-by: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebffa0e4a9c0..159dbbc3375d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6013,9 +6013,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) -- cgit v1.2.3 From 73933b3315e65d92e0c6cc4f8b4c51c9dc87546b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Jun 2015 16:56:30 -0700 Subject: mm: drop bogus VM_BUG_ON_PAGE assert in put_page() codepath My commit 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP compound pages") which was merged during 4.1 merge window caused regression: page:ffffea0010a15040 count:0 mapcount:1 mapping: (null) index:0x0 flags: 0x8000000000008014(referenced|dirty|tail) page dumped because: VM_BUG_ON_PAGE(page_mapcount(page) != 0) ------------[ cut here ]------------ kernel BUG at mm/swap.c:134! The problem can be reproduced by playing *two* audio files at the same time and then stopping one of players. I used two mplayers to trigger this. The VM_BUG_ON_PAGE() which triggers the bug is bogus: Sound subsystem uses compound pages for its buffers, but unlike most __GFP_COMP sound maps compound pages to userspace with PTEs. In our case with two players map the buffer twice and therefore elevates page_mapcount() on tail pages by two. When one of players exits it unmaps the VMA and drops page_mapcount() to one and try to release reference on the page with put_page(). My commit changes which path it takes under put_compound_page(). It hits put_unrefcounted_compound_page() where VM_BUG_ON_PAGE() is. It sees page_mapcount() == 1. The function wrongly assumes that subpages of compound page cannot be be mapped by itself with PTEs.. The solution is simply drop the VM_BUG_ON_PAGE(). Note: there's no need to move the check under put_page_testzero(). Allocator will check the mapcount by itself before putting on free list. Signed-off-by: Kirill A. Shutemov Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Reported-by: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index a7251a8ed532..a3a0a2f1f7c3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -131,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page) * here, see the comment above this function. */ VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); if (put_page_testzero(page_head)) { /* * If this is the tail of a slab THP page, -- cgit v1.2.3 From f4d2897b930cb546d435f64fd4b74ea5d1223dff Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Wed, 24 Jun 2015 16:56:36 -0700 Subject: mm/page_alloc.c: cleanup obsolete KM_USER* It's been five years now that KM_* kmap flags have been removed and that we can call clear_highpage from any context. So we remove prep_zero_pages accordingly. Signed-off-by: Anisse Astier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 159dbbc3375d..7a5cbe7cc9b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -static inline void prep_zero_page(struct page *page, unsigned int order, - gfp_t gfp_flags) -{ - int i; - - /* - * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO - * and __GFP_HIGHMEM from hard or soft interrupt context. - */ - VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly; @@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); -- cgit v1.2.3 From f012a84aff7a7f1d50b060e8b205ad68ffb86045 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 24 Jun 2015 16:56:39 -0700 Subject: mm: vmscan: do not throttle based on pfmemalloc reserves if node has no reclaimable pages Based upon 675becce15 ("mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL") from Mel. We have a system with the following topology: # numactl -H available: 3 nodes (0,2-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 node 0 size: 28273 MB node 0 free: 27323 MB node 2 cpus: node 2 size: 16384 MB node 2 free: 0 MB node 3 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 node 3 size: 30533 MB node 3 free: 13273 MB node distances: node 0 2 3 0: 10 20 20 2: 20 10 20 3: 20 20 10 Node 2 has no free memory, because: # cat /sys/devices/system/node/node2/hugepages/hugepages-16777216kB/nr_hugepages 1 This leads to the following zoneinfo: Node 2, zone DMA pages free 0 min 1840 low 2300 high 2760 scanned 0 spanned 262144 present 262144 managed 262144 ... all_unreclaimable: 1 If one then attempts to allocate some normal 16M hugepages via echo 37 > /proc/sys/vm/nr_hugepages The echo never returns and kswapd2 consumes CPU cycles. This is because throttle_direct_reclaim ends up calling wait_event(pfmemalloc_wait, pfmemalloc_watermark_ok...). pfmemalloc_watermark_ok() in turn checks all zones on the node if there are any reserves, and if so, then indicates the watermarks are ok, by seeing if there are sufficient free pages. 675becce15 added a condition already for memoryless nodes. In this case, though, the node has memory, it is just all consumed (and not reclaimable). Effectively, though, the result is the same on this call to pfmemalloc_watermark_ok() and thus seems like a reasonable additional condition. With this change, the afore-mentioned 16M hugepage allocation attempt succeeds and correctly round-robins between Nodes 1 and 3. Signed-off-by: Nishanth Aravamudan Reviewed-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Mel Gorman Cc: Anton Blanchard Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5e8eadd71bac..c627fa4c991f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2646,7 +2646,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); -- cgit v1.2.3 From 95bbc0c7210a7397fec1cd219f896ca95bf29e3e Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Wed, 24 Jun 2015 16:56:42 -0700 Subject: mm: rename RECLAIM_SWAP to RECLAIM_UNMAP The name SWAP implies that we are dealing with anonymous pages only. In fact, the original patch that introduced the min_unmapped_ratio logic was to fix an issue related to file pages. Rename it to RECLAIM_UNMAP to match what does. Historically, commit a6dc60f8975a ("vmscan: rename sc.may_swap to may_unmap") renamed .may_swap to .may_unmap, leaving RECLAIM_SWAP behind. commit 2e2e42598908 ("vmscan,memcg: reintroduce sc->may_swap") reintroduced .may_swap for memory controller. Signed-off-by: Zhihui Zhang Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c627fa4c991f..19ef01e90ac4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3597,7 +3597,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -3639,12 +3639,12 @@ static long zone_pagecache_reclaimable(struct zone *zone) long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3675,15 +3675,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .order = order, .priority = ZONE_RECLAIM_PRIORITY, .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, }; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); -- cgit v1.2.3 From 415c64c1453aa2bbcc7e30a38f8894d0894cb8ab Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:45 -0700 Subject: mm/memory-failure: split thp earlier in memory error handling memory_failure() doesn't handle thp itself at this time and need to split it before doing isolation. Currently thp is split in the middle of hwpoison_user_mappings(), but there're corner cases where memory_failure() wrongly tries to handle thp without splitting. 1) "non anonymous" thp, which is not a normal operating mode of thp, but a memory error could hit a thp before anon_vma is initialized. In such case, split_huge_page() fails and me_huge_page() (intended for hugetlb) is called for thp, which triggers BUG_ON in page_hstate(). 2) !PageLRU case, where hwpoison_user_mappings() returns with SWAP_SUCCESS and the result is the same as case 1. memory_failure() can't avoid splitting, so let's split it more earlier, which also reduces code which are prepared for both of normal page and thp. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 88 +++++++++++++++-------------------------------------- 1 file changed, 25 insertions(+), 63 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8e71b6e641ad..17a8e3bc3b01 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -928,7 +928,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1, forcekill; struct page *hpage = *hpagep; - struct page *ppage; /* * Here we are interested only in user-mapped pages, so skip any @@ -977,59 +976,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } - /* - * ppage: poisoned page - * if p is regular page(4k page) - * ppage == real poisoned page; - * else p is hugetlb or THP, ppage == head page. - */ - ppage = hpage; - - if (PageTransHuge(hpage)) { - /* - * Verify that this isn't a hugetlbfs head page, the check for - * PageAnon is just for avoid tripping a split_huge_page - * internal debug check, as split_huge_page refuses to deal with - * anything that isn't an anon page. PageAnon can't go away fro - * under us because we hold a refcount on the hpage, without a - * refcount on the hpage. split_huge_page can't be safely called - * in the first place, having a refcount on the tail isn't - * enough * to be safe. - */ - if (!PageHuge(hpage) && PageAnon(hpage)) { - if (unlikely(split_huge_page(hpage))) { - /* - * FIXME: if splitting THP is failed, it is - * better to stop the following operation rather - * than causing panic by unmapping. System might - * survive if the page is freed later. - */ - printk(KERN_INFO - "MCE %#lx: failed to split THP\n", pfn); - - BUG_ON(!PageHWPoison(p)); - return SWAP_FAIL; - } - /* - * We pinned the head page for hwpoison handling, - * now we split the thp and we are interested in - * the hwpoisoned raw page, so move the refcount - * to it. Similarly, page lock is shifted. - */ - if (hpage != p) { - if (!(flags & MF_COUNT_INCREASED)) { - put_page(hpage); - get_page(p); - } - lock_page(p); - unlock_page(hpage); - *hpagep = p; - } - /* THP is split, so ppage should be the real poisoned page. */ - ppage = p; - } - } - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, @@ -1039,12 +985,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(ppage, ttu); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(ppage)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1056,7 +1002,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); @@ -1102,6 +1048,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page_state *ps; struct page *p; struct page *hpage; + struct page *orig_head; int res; unsigned int nr_pages; unsigned long page_flags; @@ -1117,7 +1064,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - hpage = compound_head(p); + orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; @@ -1180,6 +1127,21 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } } + if (!PageHuge(p) && PageTransHuge(hpage)) { + if (!PageAnon(hpage)) { + pr_err("MCE: %#lx: non anonymous thp\n", pfn); + put_page(p); + return -EBUSY; + } + if (unlikely(split_huge_page(hpage))) { + pr_err("MCE: %#lx: thp split failed\n", pfn); + put_page(p); + return -EBUSY; + } + VM_BUG_ON_PAGE(!page_count(p), p); + hpage = compound_head(p); + } + /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others @@ -1189,9 +1151,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageHuge(p)) { - if (!PageLRU(hpage)) - shake_page(hpage, 0); - if (!PageLRU(hpage)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { /* * shake_page could have turned it free. */ @@ -1212,7 +1174,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. */ - if (compound_head(p) != hpage) { + if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); res = -EBUSY; goto out; -- cgit v1.2.3 From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:48 -0700 Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling memory_failure() can run in 2 different mode (specified by MF_COUNT_INCREASED) in page refcount perspective. When MF_COUNT_INCREASED is set, memory_failure() assumes that the caller takes a refcount of the target page. And if cleared, memory_failure() takes it in it's own. In current code, however, refcounting is done differently in each caller. For example, madvise_hwpoison() uses get_user_pages_fast() and hwpoison_inject() uses get_page_unless_zero(). So this inconsistent refcounting causes refcount failure especially for thp tail pages. Typical user visible effects are like memory leak or VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page(). To fix this refcounting issue, this patch introduces get_hwpoison_page() to handle thp tail pages in the same manner for each caller of hwpoison code. memory_failure() might fail to split thp and in such case it returns without completing page isolation. This is not good because PageHWPoison on the thp is still set and there's no easy way to unpoison such thps. So this patch try to roll back any action to the thp in "non anonymous thp" case and "thp split failed" case, expecting an MCE(SRAR) generated by later access afterward will properly free such thps. [akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m] Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 4 ++-- mm/memory-failure.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4ca5fe0042e1..bf73ac17dad4 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val) /* * This implies unable to support free buddy pages. */ - if (!get_page_unless_zero(hpage)) + if (!get_hwpoison_page(p)) return 0; if (!hwpoison_filter_enable) @@ -58,7 +58,7 @@ inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); put_out: - put_page(hpage); + put_page(p); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17a8e3bc3b01..a810ab1519f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -915,6 +915,39 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } +/** + * get_hwpoison_page() - Get refcount for memory error handling: + * @page: raw error page (hit by memory error) + * + * Return: return 0 if failed to grab the refcount, otherwise true (some + * non-zero value.) + */ +int get_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) + return get_page_unless_zero(head); + + /* + * Thp tail page has special refcounting rule (refcount of tail pages + * is stored in ->_mapcount,) so we can't call get_page_unless_zero() + * directly for tail pages. + */ + if (PageTransHuge(head)) { + if (get_page_unless_zero(head)) { + if (PageTail(page)) + get_page(page); + return 1; + } else { + return 0; + } + } + + return get_page_unless_zero(page); +} +EXPORT_SYMBOL_GPL(get_hwpoison_page); + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1097,8 +1130,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(hpage)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { action_result(pfn, MSG_BUDDY, DELAYED); return 0; @@ -1130,12 +1162,20 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHuge(p) && PageTransHuge(hpage)) { if (!PageAnon(hpage)) { pr_err("MCE: %#lx: non anonymous thp\n", pfn); + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &num_poisoned_pages); put_page(p); + if (p != hpage) + put_page(hpage); return -EBUSY; } if (unlikely(split_huge_page(hpage))) { pr_err("MCE: %#lx: thp split failed\n", pfn); + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &num_poisoned_pages); put_page(p); + if (p != hpage) + put_page(hpage); return -EBUSY; } VM_BUG_ON_PAGE(!page_count(p), p); @@ -1413,12 +1453,12 @@ int unpoison_memory(unsigned long pfn) */ if (!PageHuge(page) && PageTransHuge(page)) { pr_info("MCE: Memory failure is now running on %#lx\n", pfn); - return 0; + return 0; } nr_pages = 1 << compound_order(page); - if (!get_page_unless_zero(page)) { + if (!get_hwpoison_page(p)) { /* * Since HWPoisoned hugepage should have non-zero refcount, * race between memory failure and unpoison seems to happen. @@ -1486,7 +1526,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * When the target page is a free hugepage, just remove it * from free hugepage list. */ - if (!get_page_unless_zero(compound_head(p))) { + if (!get_hwpoison_page(p)) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = 0; -- cgit v1.2.3 From add05cecef803f3372c5fc1d2a964171872daf9f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:50 -0700 Subject: mm: soft-offline: don't free target page in successful page migration Stress testing showed that soft offline events for a process iterating "mmap-pagefault-munmap" loop can trigger VM_BUG_ON(PAGE_FLAGS_CHECK_AT_PREP) in __free_one_page(): Soft offlining page 0x70fe1 at 0x70100008d000 Soft offlining page 0x705fb at 0x70300008d000 page:ffffea0001c3f840 count:0 mapcount:0 mapping: (null) index:0x2 flags: 0x1fffff80800000(hwpoison) page dumped because: VM_BUG_ON_PAGE(page->flags & ((1 << 25) - 1)) ------------[ cut here ]------------ kernel BUG at /src/linux-dev/mm/page_alloc.c:585! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: cfg80211 rfkill crc32c_intel microcode ppdev parport_pc pcspkr serio_raw virtio_balloon parport i2c_piix4 virtio_blk virtio_net ata_generic pata_acpi floppy CPU: 3 PID: 1779 Comm: test_base_madv_ Not tainted 4.0.0-v4.0-150511-1451-00009-g82360a3730e6 #139 RIP: free_pcppages_bulk+0x52a/0x6f0 Call Trace: drain_pages_zone+0x3d/0x50 drain_local_pages+0x1d/0x30 on_each_cpu_mask+0x46/0x80 drain_all_pages+0x14b/0x1e0 soft_offline_page+0x432/0x6e0 SyS_madvise+0x73c/0x780 system_call_fastpath+0x12/0x17 Code: ff 89 45 b4 48 8b 45 c0 48 83 b8 a8 00 00 00 00 0f 85 e3 fb ff ff 0f 1f 00 0f 0b 48 8b 7d 90 48 c7 c6 e8 95 a6 81 e8 e6 32 02 00 <0f> 0b 8b 45 cc 49 89 47 30 41 8b 47 18 83 f8 ff 0f 85 10 ff ff RIP [] free_pcppages_bulk+0x52a/0x6f0 RSP ---[ end trace 53926436e76d1f35 ]--- When soft offline successfully migrates page, the source page is supposed to be freed. But there is a race condition where a source page looks isolated (i.e. the refcount is 0 and the PageHWPoison is set) but somewhat linked to pcplist. Then another soft offline event calls drain_all_pages() and tries to free such hwpoisoned page, which is forbidden. This odd page state seems to happen due to the race between put_page() in putback_lru_page() and __pagevec_lru_add_fn(). But I don't want to play with tweaking drain code as done in commit 9ab3b598d2df "mm: hwpoison: drop lru_add_drain_all() in __soft_offline_page()", or to change page freeing code for this soft offline's purpose. Instead, let's think about the difference between hard offline and soft offline. There is an interesting difference in how to isolate the in-use page between these, that is, hard offline marks PageHWPoison of the target page at first, and doesn't free it by keeping its refcount 1. OTOH, soft offline tries to free the target page then marks PageHWPoison. This difference might be the source of complexity and result in bugs like the above. So making soft offline isolate with keeping refcount can be a solution for this problem. We can pass to page migration code the "reason" which shows the caller, so let's use this more to avoid calling putback_lru_page() when called from soft offline, which effectively does the isolation for soft offline. With this change, target pages of soft offline never be reused without changing migratetype, so this patch also removes the related code. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 22 ---------------------- mm/migrate.c | 9 ++++++--- 2 files changed, 6 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a810ab1519f0..62ebb1b7f4bf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1697,20 +1697,7 @@ static int __soft_offline_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* - * After page migration succeeds, the source page can - * be trapped in pagevec and actual freeing is delayed. - * Freeing code works differently based on PG_hwpoison, - * so there's a race. We need to make sure that the - * source page should be freed back to buddy before - * setting PG_hwpoison. - */ - if (!is_free_buddy_page(page)) - drain_all_pages(page_zone(page)); SetPageHWPoison(page); - if (!is_free_buddy_page(page)) - pr_info("soft offline: %#lx: page leaked\n", - pfn); atomic_long_inc(&num_poisoned_pages); } } else { @@ -1762,14 +1749,6 @@ int soft_offline_page(struct page *page, int flags) get_online_mems(); - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. - */ - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - set_migratetype_isolate(page, true); - ret = get_any_page(page, pfn, flags); put_online_mems(); if (ret > 0) { /* for in-use pages */ @@ -1788,6 +1767,5 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } - unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/mm/migrate.c b/mm/migrate.c index f53838fe3dfe..d4fe1f94120b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -918,7 +918,8 @@ out: static ICE_noinline int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, - int force, enum migrate_mode mode) + int force, enum migrate_mode mode, + enum migrate_reason reason) { int rc = 0; int *result = NULL; @@ -949,7 +950,8 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + if (reason != MR_MEMORY_FAILURE) + putback_lru_page(page); } /* @@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, pass > 2, mode); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode); + private, page, pass > 2, mode, + reason); switch(rc) { case -ENOMEM: -- cgit v1.2.3 From 2491ffee9e66edc2a6ff5ddb49118377257c0014 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:53 -0700 Subject: mm/memory-failure: me_huge_page() does nothing for thp memory_failure() is supposed not to handle thp itself, but to split it. But if something were wrong and page_action() were called on thp, me_huge_page() (action routine for hugepages) should be better to take no action, rather than to take wrong action prepared for hugetlb (which triggers BUG_ON().) This change is for potential problems, but makes sense to me because thp is an actively developing feature and this code path can be open in the future. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 62ebb1b7f4bf..c72f41bfbaaf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -777,6 +777,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + + if (!PageHuge(hpage)) + return MF_DELAYED; + /* * We can safely recover from error on free or reserved (i.e. * not in-use) hugepage by dequeuing it from freelist. -- cgit v1.2.3 From 414e2fb8ce5a999571c21eb2ca4d66e53ddce800 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 24 Jun 2015 16:56:56 -0700 Subject: rmap: fix theoretical race between do_wp_page and shrink_active_list As noted by Paul the compiler is free to store a temporary result in a variable on stack, heap or global unless it is explicitly marked as volatile, see: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html#sample-optimizations This can result in a race between do_wp_page() and shrink_active_list() as follows. In do_wp_page() we can call page_move_anon_rmap(), which sets page->mapping as follows: anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; The page in question may be on an LRU list, because nowhere in do_wp_page() we remove it from the list, neither do we take any LRU related locks. Although the page is locked, shrink_active_list() can still call page_referenced() on it concurrently, because the latter does not require an anonymous page to be locked: CPU0 CPU1 ---- ---- do_wp_page shrink_active_list lock_page page_referenced PageAnon->yes, so skip trylock_page page_move_anon_rmap page->mapping = anon_vma rmap_walk PageAnon->no rmap_walk_file BUG page->mapping += PAGE_MAPPING_ANON This patch fixes this race by explicitly forbidding the compiler to split page->mapping store in page_move_anon_rmap() with the aid of WRITE_ONCE. [akpm@linux-foundation.org: tweak comment, per Minchan] Signed-off-by: Vladimir Davydov Cc: "Paul E. McKenney" Acked-by: Kirill A. Shutemov Acked-by: Rik van Riel Cc: Hugh Dickins Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 24dd3f9fee27..9f47f152b01e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -950,7 +950,12 @@ void page_move_anon_rmap(struct page *page, VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; + /* + * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * simultaneously, so a concurrent reader (eg page_referenced()'s + * PageAnon()) will not see one without the other. + */ + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); } /** -- cgit v1.2.3 From 641844f5616d7c6597309f560838f996466d7aac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 24 Jun 2015 16:56:59 -0700 Subject: mm/hugetlb: introduce minimum hugepage order Currently the initial value of order in dissolve_free_huge_page is 64 or 32, which leads to the following warning in static checker: mm/hugetlb.c:1203 dissolve_free_huge_pages() warn: potential right shift more than type allows '9,18,64' This is a potential risk of infinite loop, because 1 << order (== 0) is used in for-loop like this: for (pfn =3D start_pfn; pfn < end_pfn; pfn +=3D 1 << order) ... So this patch fixes it by using global minimum_order calculated at boot time. text data bss dec hex filename 28313 469 84236 113018 1b97a mm/hugetlb.o 28256 473 84236 112965 1b945 mm/hugetlb.o (patched) Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Reported-by: Dan Carpenter Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 716465ae57aa..10de25cf1f99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,11 @@ int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +/* + * Minimum page order among possible hugepage sizes, set to a proper value + * at boot time. + */ +static unsigned int minimum_order __read_mostly = UINT_MAX; __initdata LIST_HEAD(huge_boot_pages); @@ -1188,19 +1193,13 @@ static void dissolve_free_huge_page(struct page *page) */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned int order = 8 * sizeof(void *); unsigned long pfn; - struct hstate *h; if (!hugepages_supported()) return; - /* Set scan step to minimum hugepage size */ - for_each_hstate(h) - if (order > huge_page_order(h)) - order = huge_page_order(h); - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } @@ -1627,10 +1626,14 @@ static void __init hugetlb_init_hstates(void) struct hstate *h; for_each_hstate(h) { + if (minimum_order > huge_page_order(h)) + minimum_order = huge_page_order(h); + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } + VM_BUG_ON(minimum_order == UINT_MAX); } static char * __init memfmt(char *buf, unsigned long n) -- cgit v1.2.3 From febd5949e134e279bde927353dc705ce41b18e2d Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 24 Jun 2015 16:57:02 -0700 Subject: mm/memory hotplug: init the zone's size when calculating node totalpages Init the zone's size when calculating node totalpages to avoid duplicated operations in free_area_init_core(). Signed-off-by: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a5cbe7cc9b6..3b02be4def90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4756,22 +4756,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long realtotalpages, totalpages = 0; + unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zones_size); - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= - zone_absent_pages_in_node(pgdat->node_id, i, + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long size, real_size; + + size = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zones_size); + real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + zone->spanned_pages = size; + zone->present_pages = real_size; + + totalpages += size; + realtotalpages += real_size; + } + + pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4881,8 +4887,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, unsigned long node_end_pfn) { enum zone_type j; int nid = pgdat->node_id; @@ -4903,12 +4908,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, node_start_pfn, - node_end_pfn, zones_size); - realsize = freesize = size - zone_absent_pages_in_node(nid, j, - node_start_pfn, - node_end_pfn, - zholes_size); + size = zone->spanned_pages; + realsize = freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory @@ -4943,8 +4944,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; - zone->spanned_pages = size; - zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. @@ -5050,8 +5049,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3 From 3f5ab8cfbf15e8e02838ffc3549191351305df0e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:04 -0700 Subject: mm: oom_kill: remove unnecessary locking in oom_enable() Setting oom_killer_disabled to false is atomic, there is no need for further synchronization with ongoing allocations trying to OOM-kill. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2b665da1b3c9..73763e489e86 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -488,9 +488,7 @@ bool oom_killer_disable(void) */ void oom_killer_enable(void) { - down_write(&oom_sem); oom_killer_disabled = false; - up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) -- cgit v1.2.3 From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:07 -0700 Subject: mm: oom_kill: clean up victim marking and exiting interfaces Rename unmark_oom_victim() to exit_oom_victim(). Marking and unmarking are related in functionality, but the interface is not symmetrical at all: one is an internal OOM killer function used during the killing, the other is for an OOM victim to signal its own death on exit later on. This has locking implications, see follow-up changes. While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which is easier on the eye. Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/oom_kill.c | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a04225d372ba..20a7e874f719 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,7 +1536,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - mark_tsk_oom_victim(current); + mark_oom_victim(current); return; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 73763e489e86..b2f081fe4b1a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -408,13 +408,13 @@ bool oom_killer_disabled __read_mostly; static DECLARE_RWSEM(oom_sem); /** - * mark_tsk_oom_victim - marks the given task as OOM victim. + * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark * * Has to be called with oom_sem taken for read and never after * oom has been disabled already. */ -void mark_tsk_oom_victim(struct task_struct *tsk) +void mark_oom_victim(struct task_struct *tsk) { WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ @@ -431,11 +431,9 @@ void mark_tsk_oom_victim(struct task_struct *tsk) } /** - * unmark_oom_victim - unmarks the current task as OOM victim. - * - * Wakes up all waiters in oom_killer_disable() + * exit_oom_victim - note the exit of an OOM victim */ -void unmark_oom_victim(void) +void exit_oom_victim(void) { if (!test_and_clear_thread_flag(TIF_MEMDIE)) return; @@ -515,7 +513,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - mark_tsk_oom_victim(p); + mark_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -570,7 +568,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - mark_tsk_oom_victim(victim); + mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -728,7 +726,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - mark_tsk_oom_victim(current); + mark_oom_victim(current); return; } -- cgit v1.2.3 From 464027785ff633468ecbb683ede14672d514b3d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:10 -0700 Subject: mm: oom_kill: switch test-and-clear of known TIF_MEMDIE to clear exit_oom_victim() already knows that TIF_MEMDIE is set, and nobody else can clear it concurrently. Use clear_thread_flag() directly. Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b2f081fe4b1a..4b9547be9170 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -435,8 +435,7 @@ void mark_oom_victim(struct task_struct *tsk) */ void exit_oom_victim(void) { - if (!test_and_clear_thread_flag(TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); down_read(&oom_sem); /* -- cgit v1.2.3 From c38f1025f2910d6183e9923d4b4d5804474b50c5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:13 -0700 Subject: mm: oom_kill: generalize OOM progress waitqueue It turns out that the mechanism to wait for exiting OOM victims is less generic than it looks: it won't issue wakeups unless the OOM killer is disabled. The reason this check was added was the thought that, since only the OOM disabling code would wait on this queue, wakeup operations could be saved when that specific consumer is known to be absent. However, this is quite the handgrenade. Later attempts to reuse the waitqueue for other purposes will lead to completely unexpected bugs and the failure mode will appear seemingly illogical. Generally, providers shouldn't make unnecessary assumptions about consumers. This could have been replaced with waitqueue_active(), but it only saves a few instructions in one of the coldest paths in the kernel. Simply remove it. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4b9547be9170..472f124e5f08 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -438,11 +438,7 @@ void exit_oom_victim(void) clear_thread_flag(TIF_MEMDIE); down_read(&oom_sem); - /* - * There is no need to signal the lasst oom_victim if there - * is nobody who cares. - */ - if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); up_read(&oom_sem); } -- cgit v1.2.3 From da51b14adb671829077da3aeb9e9edd6f8c80afe Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:16 -0700 Subject: mm: oom_kill: remove unnecessary locking in exit_oom_victim() Disabling the OOM killer needs to exclude allocators from entering, not existing victims from exiting. Right now the only waiter is suspend code, which achieves quiescence by disabling the OOM killer. But later on we want to add waits that hold the lock instead to stop new victims from showing up. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 472f124e5f08..d3490b019d46 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -437,10 +437,8 @@ void exit_oom_victim(void) { clear_thread_flag(TIF_MEMDIE); - down_read(&oom_sem); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); - up_read(&oom_sem); } /** -- cgit v1.2.3 From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:19 -0700 Subject: mm: oom_kill: simplify OOM killer locking The zonelist locking and the oom_sem are two overlapping locks that are used to serialize global OOM killing against different things. The historical zonelist locking serializes OOM kills from allocations with overlapping zonelists against each other to prevent killing more tasks than necessary in the same memory domain. Only when neither tasklists nor zonelists from two concurrent OOM kills overlap (tasks in separate memcgs bound to separate nodes) are OOM kills allowed to execute in parallel. The younger oom_sem is a read-write lock to serialize OOM killing against the PM code trying to disable the OOM killer altogether. However, the OOM killer is a fairly cold error path, there is really no reason to optimize for highly performant and concurrent OOM kills. And the oom_sem is just flat-out redundant. Replace both locking schemes with a single global mutex serializing OOM kills regardless of context. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 ++++---- mm/oom_kill.c | 127 ++++++++++++-------------------------------------------- mm/page_alloc.c | 8 ++-- 3 files changed, 42 insertions(+), 111 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20a7e874f719..8da44a083397 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1530,6 +1530,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int points = 0; struct task_struct *chosen = NULL; + mutex_lock(&oom_lock); + /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may @@ -1537,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); - return; + goto unlock; } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); @@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); - return; + goto unlock; case OOM_SCAN_OK: break; }; @@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_end(&it); } - if (!chosen) - return; - points = chosen_points * 1000 / totalpages; - oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, - NULL, "Memory cgroup out of memory"); + if (chosen) { + points = chosen_points * 1000 / totalpages; + oom_kill_process(chosen, gfp_mask, order, points, totalpages, + memcg, NULL, "Memory cgroup out of memory"); + } +unlock: + mutex_unlock(&oom_lock); } #if MAX_NUMNODES > 1 diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3490b019d46..5cfda39b3268 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -42,7 +42,8 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; -static DEFINE_SPINLOCK(zone_scan_lock); + +DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA /** @@ -405,13 +406,12 @@ static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; -static DECLARE_RWSEM(oom_sem); /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark * - * Has to be called with oom_sem taken for read and never after + * Has to be called with oom_lock held and never after * oom has been disabled already. */ void mark_oom_victim(struct task_struct *tsk) @@ -460,14 +460,14 @@ bool oom_killer_disable(void) * Make sure to not race with an ongoing OOM killer * and that the current is not the victim. */ - down_write(&oom_sem); + mutex_lock(&oom_lock); if (test_thread_flag(TIF_MEMDIE)) { - up_write(&oom_sem); + mutex_unlock(&oom_lock); return false; } oom_killer_disabled = true; - up_write(&oom_sem); + mutex_unlock(&oom_lock); wait_event(oom_victims_wait, !atomic_read(&oom_victims)); @@ -634,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); -/* - * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero - * if a parallel OOM killing is already taking place that includes a zone in - * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. - */ -bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - bool ret = true; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { - ret = false; - goto out; - } - - /* - * Lock each zone in the zonelist under zone_scan_lock so a parallel - * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. - */ - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - set_bit(ZONE_OOM_LOCKED, &zone->flags); - -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed - * allocation attempts with zonelists containing them may now recall the OOM - * killer, if necessary. - */ -void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - clear_bit(ZONE_OOM_LOCKED, &zone->flags); - spin_unlock(&zone_scan_lock); -} - /** * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer @@ -693,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; struct task_struct *p; @@ -704,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, enum oom_constraint constraint = CONSTRAINT_NONE; int killed = 0; + if (oom_killer_disabled) + return false; + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ - return; + goto out; /* * If current has a pending SIGKILL or is exiting, then automatically @@ -720,7 +677,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); - return; + goto out; } /* @@ -760,32 +717,8 @@ out: */ if (killed) schedule_timeout_killable(1); -} - -/** - * out_of_memory - tries to invoke OOM killer. - * @zonelist: zonelist pointer - * @gfp_mask: memory allocation flags - * @order: amount of memory being requested as a power of 2 - * @nodemask: nodemask passed to page allocator - * @force_kill: true if a task must be killed, even if others are exiting - * - * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() - * when it returns false. Otherwise returns true. - */ -bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) -{ - bool ret = false; - - down_read(&oom_sem); - if (!oom_killer_disabled) { - __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); - ret = true; - } - up_read(&oom_sem); - return ret; + return true; } /* @@ -795,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ void pagefault_out_of_memory(void) { - struct zonelist *zonelist; - - down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - goto unlock; + return; - zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - if (!oom_killer_disabled) - __out_of_memory(NULL, 0, 0, NULL, false); - else - /* - * There shouldn't be any user tasks runable while the - * OOM killer is disabled so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. - */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); + if (!mutex_trylock(&oom_lock)) + return; - oom_zonelist_unlock(zonelist, GFP_KERNEL); + if (!out_of_memory(NULL, 0, 0, NULL, false)) { + /* + * There shouldn't be any user tasks runnable while the + * OOM killer is disabled, so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); } -unlock: - up_read(&oom_sem); + + mutex_unlock(&oom_lock); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b02be4def90..cae21dc9d54e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2360,10 +2360,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; /* - * Acquire the per-zone oom lock for each zone. If that - * fails, somebody else is making progress for us. + * Acquire the oom lock. If that fails, somebody else is + * making progress for us. */ - if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + if (!mutex_trylock(&oom_lock)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; @@ -2408,7 +2408,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: - oom_zonelist_unlock(ac->zonelist, gfp_mask); + mutex_unlock(&oom_lock); return page; } -- cgit v1.2.3 From 9083905a2a13dec4093a9c35a9b7f60037b87672 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Jun 2015 16:57:21 -0700 Subject: mm: page_alloc: inline should_alloc_retry() The should_alloc_retry() function was meant to encapsulate retry conditions of the allocator slowpath, but there are still checks remaining in the main function, and much of how the retrying is performed also depends on the OOM killer progress. The physical separation of those conditions make the code hard to follow. Inline the should_alloc_retry() checks. Notes: - The __GFP_NOFAIL check is already done in __alloc_pages_may_oom(), replace it with looping on OOM killer progress - The pm_suspended_storage() check is meant to skip the OOM killer when reclaim has no IO available, move to __alloc_pages_may_oom() - The order <= PAGE_ALLOC_COSTLY order is re-united with its original counterpart of checking whether reclaim actually made any progress Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: Tetsuo Handa Cc: Andrea Arcangeli Cc: Dave Chinner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 104 +++++++++++++++++--------------------------------------- 1 file changed, 32 insertions(+), 72 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cae21dc9d54e..2880f6f61d05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2309,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) show_mem(filter); } -static inline int -should_alloc_retry(gfp_t gfp_mask, unsigned int order, - unsigned long did_some_progress, - unsigned long pages_reclaimed) -{ - /* Do not loop if specifically requested */ - if (gfp_mask & __GFP_NORETRY) - return 0; - - /* Always retry if specifically requested */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - - /* - * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim - * making forward progress without invoking OOM. Suspend also disables - * storage devices so kswapd will not help. Bail if we are suspending. - */ - if (!did_some_progress && pm_suspended_storage()) - return 0; - - /* - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - */ - if (order <= PAGE_ALLOC_COSTLY_ORDER) - return 1; - - /* - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ - if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) - return 1; - - return 0; -} - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -2389,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for light reclaim */ + /* The OOM killer does not compensate for IO-less reclaim */ if (!(gfp_mask & __GFP_FS)) { /* * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but - * keep looping as per should_alloc_retry(). + * keep looping as per tradition. */ *did_some_progress = 1; goto out; } + if (pm_suspended_storage()) + goto out; /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; @@ -2781,40 +2741,40 @@ retry: if (page) goto got_pg; - /* Check if we should retry the allocation */ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + goto noretry; + + /* Keep reclaiming pages as long as there is reasonable progress */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, did_some_progress, - pages_reclaimed)) { - /* - * If we fail to make progress by freeing individual - * pages, but the allocation wants us to keep going, - * start OOM killing tasks. - */ - if (!did_some_progress) { - page = __alloc_pages_may_oom(gfp_mask, order, ac, - &did_some_progress); - if (page) - goto got_pg; - if (!did_some_progress) - goto nopage; - } + if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || + ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; - } else { - /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary - */ - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); - if (page) - goto got_pg; } + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); + if (page) + goto got_pg; + + /* Retry as long as the OOM killer is making progress */ + if (did_some_progress) + goto retry; + +noretry: + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, + ac, migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: -- cgit v1.2.3 From 4165b9b46181290d7e6ac276080c89b65623c633 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Jun 2015 16:57:24 -0700 Subject: hugetlb: do not account hugetlb pages as NR_FILE_PAGES hugetlb pages uses add_to_page_cache to track shared mappings. This is OK from the data structure point of view but it is less so from the NR_FILE_PAGES accounting: - huge pages are accounted as 4k which is clearly wrong - this counter is used as the amount of the reclaimable page cache which is incorrect as well because hugetlb pages are special and not reclaimable - the counter is then exported to userspace via /proc/meminfo (in Cached:), /proc/vmstat and /proc/zoneinfo as nr_file_pages which is confusing at least: Cached: 8883504 kB HugePages_Free: 8348 ... Cached: 8916048 kB HugePages_Free: 156 ... thats 8192 huge pages allocated which is ~16G accounted as 32M There are usually not that many huge pages in the system for this to make any visible difference e.g. by fooling __vm_enough_memory or zone_pagecache_reclaimable. Fix this by special casing huge pages in both __delete_from_page_cache and __add_to_page_cache_locked. replace_page_cache_page is currently only used by fuse and that shouldn't touch hugetlb pages AFAICS but it is more robust to check for special casing there as well. Hugetlb pages shouldn't get to any other paths where we do accounting: - migration - we have a special handling via hugetlbfs_migrate_page - shmem - doesn't handle hugetlb pages directly even for SHM_HUGETLB resp. MAP_HUGETLB - swapcache - hugetlb is not swapable This has a user visible effect but I believe it is reasonable because the previously exported number is simply bogus. An alternative would be to account hugetlb pages with their real size and treat them similar to shmem. But this has some drawbacks. First we would have to special case in kernel users of NR_FILE_PAGES and considering how hugetlb is special we would have to do it everywhere. We do not want Cached exported by /proc/meminfo to include it because the value would be even more misleading. __vm_enough_memory and zone_pagecache_reclaimable would have to do the same thing because those pages are simply not reclaimable. The correction is even not trivial because we would have to consider all active hugetlb page sizes properly. Users of the counter outside of the kernel would have to do the same. So the question is why to account something that needs to be basically excluded for each reasonable usage. This doesn't make much sense to me. It seems that this has been broken since hugetlb was introduced but I haven't checked the whole history. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Michal Hocko Acked-by: Mel Gorman Tested-by: Mike Kravetz Acked-by: Johannes Weiner Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6bf5e42d560a..519af0045b1c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow) page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - __dec_zone_page_state(page, NR_FILE_PAGES); + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(page)) + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); @@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. + */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); @@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page, radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - __inc_zone_page_state(page, NR_FILE_PAGES); + + /* hugetlb pages do not participate in page cache accounting. */ + if (!huge) + __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false); -- cgit v1.2.3 From eb3c24f305e56caaf5c4bd34d2923839688d470e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 24 Jun 2015 16:57:27 -0700 Subject: mm, memcg: Try charging a page before setting page up to date Historically memcg overhead was high even if memcg was unused. This has improved a lot but it still showed up in a profile summary as being a problem. /usr/src/linux-4.0-vanilla/mm/memcontrol.c 6.6441 395842 mem_cgroup_try_charge 2.950% 175781 __mem_cgroup_count_vm_event 1.431% 85239 mem_cgroup_page_lruvec 0.456% 27156 mem_cgroup_commit_charge 0.392% 23342 uncharge_list 0.323% 19256 mem_cgroup_update_lru_size 0.278% 16538 memcg_check_events 0.216% 12858 mem_cgroup_charge_statistics.isra.22 0.188% 11172 try_charge 0.150% 8928 commit_charge 0.141% 8388 get_mem_cgroup_from_mm 0.121% 7184 That is showing that 6.64% of system CPU cycles were in memcontrol.c and dominated by mem_cgroup_try_charge. The annotation shows that the bulk of the cost was checking PageSwapCache which is expected to be cache hot but is very expensive. The problem appears to be that __SetPageUptodate is called just before the check which is a write barrier. It is required to make sure struct page and page data is written before the PTE is updated and the data visible to userspace. memcg charging does not require or need the barrier but gets unfairly hit with the cost so this patch attempts the charging before the barrier. Aside from the accidental cost to memcg there is the added benefit that the barrier is avoided if the page cannot be charged. When applied the relevant profile summary is as follows. /usr/src/linux-4.0-chargefirst-v2r1/mm/memcontrol.c 3.7907 223277 __mem_cgroup_count_vm_event 1.143% 67312 mem_cgroup_page_lruvec 0.465% 27403 mem_cgroup_commit_charge 0.381% 22452 uncharge_list 0.332% 19543 mem_cgroup_update_lru_size 0.284% 16704 get_mem_cgroup_from_mm 0.271% 15952 mem_cgroup_try_charge 0.237% 13982 memcg_check_events 0.222% 13058 mem_cgroup_charge_statistics.isra.22 0.185% 10920 commit_charge 0.140% 8235 try_charge 0.131% 7716 That brings the overhead down to 3.79% and leaves the memcg fault accounting to the root cgroup but it's an improvement. The difference in headline performance of the page fault microbench is marginal as memcg is such a small component of it. pft faults 4.0.0 4.0.0 vanilla chargefirst Hmean faults/cpu-1 1443258.1051 ( 0.00%) 1509075.7561 ( 4.56%) Hmean faults/cpu-3 1340385.9270 ( 0.00%) 1339160.7113 ( -0.09%) Hmean faults/cpu-5 875599.0222 ( 0.00%) 874174.1255 ( -0.16%) Hmean faults/cpu-7 601146.6726 ( 0.00%) 601370.9977 ( 0.04%) Hmean faults/cpu-8 510728.2754 ( 0.00%) 510598.8214 ( -0.03%) Hmean faults/sec-1 1432084.7845 ( 0.00%) 1497935.5274 ( 4.60%) Hmean faults/sec-3 3943818.1437 ( 0.00%) 3941920.1520 ( -0.05%) Hmean faults/sec-5 3877573.5867 ( 0.00%) 3869385.7553 ( -0.21%) Hmean faults/sec-7 3991832.0418 ( 0.00%) 3992181.4189 ( 0.01%) Hmean faults/sec-8 3987189.8167 ( 0.00%) 3986452.2204 ( -0.02%) It's only visible at single threaded. The overhead is there for higher threads but other factors dominate. Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 17734c3c1183..11b9ca176740 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2081,11 +2081,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; cow_user_page(new_page, old_page, address, vma); } - __SetPageUptodate(new_page); if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; + __SetPageUptodate(new_page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2689,6 +2690,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + goto oom_free_page; + /* * The memory barrier inside __SetPageUptodate makes sure that * preceeding stores to the page contents become visible before @@ -2696,9 +2701,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) - goto oom_free_page; - entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); -- cgit v1.2.3 From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:30 -0700 Subject: memory-failure: export page_type and action result Export 'outcome' and 'action_page_type' to mm.h, so we could use this emnus outside. This patch is preparation for adding trace events for memory-failure recovery action. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 168 +++++++++++++++++++++------------------------------- 1 file changed, 67 insertions(+), 101 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c72f41bfbaaf..b71a3cd3d0b0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -504,68 +504,34 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } -/* - * Error handlers for various types of pages. - */ - -enum outcome { - IGNORED, /* Error: cannot be handled */ - FAILED, /* Error: handling failed */ - DELAYED, /* Will be handled later */ - RECOVERED, /* Successfully recovered */ -}; - static const char *action_name[] = { - [IGNORED] = "Ignored", - [FAILED] = "Failed", - [DELAYED] = "Delayed", - [RECOVERED] = "Recovered", -}; - -enum action_page_type { - MSG_KERNEL, - MSG_KERNEL_HIGH_ORDER, - MSG_SLAB, - MSG_DIFFERENT_COMPOUND, - MSG_POISONED_HUGE, - MSG_HUGE, - MSG_FREE_HUGE, - MSG_UNMAP_FAILED, - MSG_DIRTY_SWAPCACHE, - MSG_CLEAN_SWAPCACHE, - MSG_DIRTY_MLOCKED_LRU, - MSG_CLEAN_MLOCKED_LRU, - MSG_DIRTY_UNEVICTABLE_LRU, - MSG_CLEAN_UNEVICTABLE_LRU, - MSG_DIRTY_LRU, - MSG_CLEAN_LRU, - MSG_TRUNCATED_LRU, - MSG_BUDDY, - MSG_BUDDY_2ND, - MSG_UNKNOWN, + [MF_IGNORED] = "Ignored", + [MF_FAILED] = "Failed", + [MF_DELAYED] = "Delayed", + [MF_RECOVERED] = "Recovered", }; static const char * const action_page_types[] = { - [MSG_KERNEL] = "reserved kernel page", - [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", - [MSG_SLAB] = "kernel slab page", - [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MSG_POISONED_HUGE] = "huge page already hardware poisoned", - [MSG_HUGE] = "huge page", - [MSG_FREE_HUGE] = "free huge page", - [MSG_UNMAP_FAILED] = "unmapping failed page", - [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", - [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", - [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", - [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", - [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", - [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", - [MSG_DIRTY_LRU] = "dirty LRU page", - [MSG_CLEAN_LRU] = "clean LRU page", - [MSG_TRUNCATED_LRU] = "already truncated LRU page", - [MSG_BUDDY] = "free buddy page", - [MSG_BUDDY_2ND] = "free buddy page (2nd try)", - [MSG_UNKNOWN] = "unknown page", + [MF_MSG_KERNEL] = "reserved kernel page", + [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MF_MSG_SLAB] = "kernel slab page", + [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MF_MSG_HUGE] = "huge page", + [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_UNMAP_FAILED] = "unmapping failed page", + [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MF_MSG_DIRTY_LRU] = "dirty LRU page", + [MF_MSG_CLEAN_LRU] = "clean LRU page", + [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MF_MSG_BUDDY] = "free buddy page", + [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MF_MSG_UNKNOWN] = "unknown page", }; /* @@ -599,7 +565,7 @@ static int delete_from_lru_cache(struct page *p) */ static int me_kernel(struct page *p, unsigned long pfn) { - return IGNORED; + return MF_IGNORED; } /* @@ -608,7 +574,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); - return FAILED; + return MF_FAILED; } /* @@ -617,7 +583,7 @@ static int me_unknown(struct page *p, unsigned long pfn) static int me_pagecache_clean(struct page *p, unsigned long pfn) { int err; - int ret = FAILED; + int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -627,7 +593,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * should be the one m_f() holds. */ if (PageAnon(p)) - return RECOVERED; + return MF_RECOVERED; /* * Now truncate the page in the page cache. This is really @@ -641,7 +607,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return FAILED; + return MF_FAILED; } /* @@ -658,7 +624,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { - ret = RECOVERED; + ret = MF_RECOVERED; } } else { /* @@ -666,7 +632,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * This fails on dirty or anything with private pages */ if (invalidate_inode_page(p)) - ret = RECOVERED; + ret = MF_RECOVERED; else printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", pfn); @@ -752,9 +718,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ClearPageUptodate(p); if (!delete_from_lru_cache(p)) - return DELAYED; + return MF_DELAYED; else - return FAILED; + return MF_FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) @@ -762,9 +728,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) delete_from_swap_cache(p); if (!delete_from_lru_cache(p)) - return RECOVERED; + return MF_RECOVERED; else - return FAILED; + return MF_FAILED; } /* @@ -794,9 +760,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); if (!res) - return RECOVERED; + return MF_RECOVERED; } - return DELAYED; + return MF_DELAYED; } /* @@ -828,10 +794,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) static struct page_state { unsigned long mask; unsigned long res; - enum action_page_type type; + enum mf_action_page_type type; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, MSG_KERNEL, me_kernel }, + { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -842,31 +808,31 @@ static struct page_state { * currently unused objects without touching them. But just * treat it as standard kernel for now. */ - { slab, slab, MSG_SLAB, me_kernel }, + { slab, slab, MF_MSG_SLAB, me_kernel }, #ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, MSG_HUGE, me_huge_page }, - { tail, tail, MSG_HUGE, me_huge_page }, + { head, head, MF_MSG_HUGE, me_huge_page }, + { tail, tail, MF_MSG_HUGE, me_huge_page }, #else - { compound, compound, MSG_HUGE, me_huge_page }, + { compound, compound, MF_MSG_HUGE, me_huge_page }, #endif - { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, - { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, + { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, - { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, - { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, + { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, - { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, - { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, + { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, - { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, - { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, + { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, /* * Catchall entry: must be at end. */ - { 0, 0, MSG_UNKNOWN, me_unknown }, + { 0, 0, MF_MSG_UNKNOWN, me_unknown }, }; #undef dirty @@ -886,7 +852,7 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, enum action_page_type type, int result) +static void action_result(unsigned long pfn, enum mf_action_page_type type, int result) { pr_err("MCE %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -901,13 +867,13 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == DELAYED) + if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { printk(KERN_ERR "MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); - result = FAILED; + result = MF_FAILED; } action_result(pfn, ps->type, result); @@ -916,7 +882,7 @@ static int page_action(struct page_state *ps, struct page *p, * Could adjust zone counters here to correct for the missing page. */ - return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } /** @@ -1136,7 +1102,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; } else if (PageHuge(hpage)) { /* @@ -1153,12 +1119,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MSG_FREE_HUGE, - res ? IGNORED : DELAYED); + action_result(pfn, MF_MSG_FREE_HUGE, + res ? MF_IGNORED : MF_DELAYED); unlock_page(hpage); return res; } else { - action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } @@ -1203,10 +1169,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (is_free_buddy_page(p)) { if (flags & MF_COUNT_INCREASED) - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else - action_result(pfn, MSG_BUDDY_2ND, - DELAYED); + action_result(pfn, MF_MSG_BUDDY_2ND, + MF_DELAYED); return 0; } } @@ -1219,7 +1185,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * If this happens just bail out. */ if (PageCompound(p) && compound_head(p) != orig_head) { - action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); + action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto out; } @@ -1259,7 +1225,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * on the head page to show that the hugepage is hwpoisoned */ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MSG_POISONED_HUGE, IGNORED); + action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); unlock_page(hpage); put_page(hpage); return 0; @@ -1288,7 +1254,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - action_result(pfn, MSG_UNMAP_FAILED, IGNORED); + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; } @@ -1297,7 +1263,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); + action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; goto out; } -- cgit v1.2.3 From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:33 -0700 Subject: memory-failure: change type of action_result's param 3 to enum Change type of action_result's param 3 to enum for type consistency, and rename mf_outcome to mf_result for clearly. Signed-off-by: Xie XiuQi Acked-by: Naoya Horiguchi Cc: Chen Gong Cc: Jim Davis Cc: Steven Rostedt Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b71a3cd3d0b0..15c0d5ab0893 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -852,7 +852,8 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, enum mf_action_page_type type, int result) +static void action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { pr_err("MCE %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); -- cgit v1.2.3 From 97f0b13452198290799fd6780f05fbaa74f927d3 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 24 Jun 2015 16:57:36 -0700 Subject: tracing: add trace event for memory-failure RAS user space tools like rasdaemon which base on trace event, could receive mce error event, but no memory recovery result event. So, I want to add this event to make this scenario complete. This patch add a event at ras group for memory-failure. The output like below: # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:24 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | mce-inject-13150 [001] .... 277.019359: memory_failure_event: pfn 0x19869: recovery action for free buddy page: Delayed [xiexiuqi@huawei.com: fix build error] Signed-off-by: Xie XiuQi Reviewed-by: Naoya Horiguchi Acked-by: Steven Rostedt Cc: Tony Luck Cc: Chen Gong Cc: Jim Davis Signed-off-by: Xie XiuQi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + mm/memory-failure.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 390214da4546..c180af880ed5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -368,6 +368,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION + select RAS help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 15c0d5ab0893..c53543d89282 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -57,6 +57,7 @@ #include #include #include "internal.h" +#include "ras/ras_event.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -855,6 +856,8 @@ static struct page_state { static void action_result(unsigned long pfn, enum mf_action_page_type type, enum mf_result result) { + trace_memory_failure_event(pfn, type, result); + pr_err("MCE %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } -- cgit v1.2.3 From 15a25b2ead5f97c5a63c169186e294b41ce03f9a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:39 -0700 Subject: mm/thp: split out pmd collapse flush into separate functions Architectures like ppc64 [1] need to do special things while clearing pmd before a collapse. For them this operation is largely different from a normal hugepage pte clear. Hence add a separate function to clear pmd before collapse. After this patch pmdp_* functions operate only on hugepage pte, and not on regular pmd_t values pointing to page table. [1] ppc64 needs to invalidate all the normal page pte mappings we already have inserted in the hardware hash page table. But before doing that we need to make sure there are no parallel hash page table insert going on. So we need to do a kick_all_cpus_sync() before flushing the older hash table entries. By moving this to a separate function we capture these details and mention how it is different from a hugepage pte clear. This patch is a cleanup and only does code movement for clarity. There should not be any change in functionality. Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b3d8cd8d6968..65dd8d67287b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush(vma, address, pmd); + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -- cgit v1.2.3 From f28b6ff8c3d48af21354ef30164c8ccea69d5928 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:42 -0700 Subject: powerpc/mm: use generic version of pmdp_clear_flush() Also move the pmd_trans_huge check to generic code. Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c25f94b33811..f21dc5fbc6cd 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -126,6 +126,7 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!pmd_trans_huge(*pmdp)); pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; @@ -198,3 +199,19 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef pmdp_collapse_flush +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif -- cgit v1.2.3 From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Jun 2015 16:57:44 -0700 Subject: mm: clarify that the function operates on hugepage pte We have confusing functions to clear pmd, pmd_clear_* and pmd_clear. Add _huge_ to pmdp_clear functions so that we are clear that they operate on hugepage pte. We don't bother about other functions like pmdp_set_wrprotect, pmdp_clear_flush_young, because they operate on PTE bits and hence indicate they are operating on hugepage ptes Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Andrea Arcangeli Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 16 ++++++++-------- mm/migrate.c | 2 +- mm/pgtable-generic.c | 14 +++++++++----- mm/rmap.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 65dd8d67287b..c107094f79ba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1174,7 +1174,7 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; /* * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_get_and_clear. So do the + * when calling pmdp_huge_get_and_clear. So do the * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { @@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl)) { @@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mkwrite(entry); @@ -2863,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); diff --git a/mm/migrate.c b/mm/migrate.c index d4fe1f94120b..ee401e4e5ef1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1799,7 +1799,7 @@ fail_putback: */ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start); - pmdp_clear_flush_notify(vma, mmun_start, pmd); + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index f21dc5fbc6cd..6b674e00153c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -119,15 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, } #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(!pmd_trans_huge(*pmdp)); - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } @@ -205,11 +205,15 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + /* + * pmd and hugepage pte format are same. So we could + * use the same function. + */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } diff --git a/mm/rmap.c b/mm/rmap.c index 9f47f152b01e..7af1ecb21ccb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pmd = pmd_offset(pud, address); /* - * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ -- cgit v1.2.3 From 22cc877b32202b6d82e580bc6b3b445531659d3e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jun 2015 16:57:47 -0700 Subject: mm: nommu: refactor debug and warning prints kenter/kleave/kdebug are wrapper macros to print functions flow and debug information. This set was written before pr_devel() was introduced, so it was controlled by "#if 0" construction. It is questionable if anyone is using them [1] now. This patch removes these macros, converts numerous printk(KERN_WARNING, ...) to use general pr_warn(...) and removes debug print line from validate_mmap_request() function. Signed-off-by: Leon Romanovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 112 +++++++++++-------------------------------------------------- 1 file changed, 20 insertions(+), 92 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index e544508e2a4b..05e7447d960b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -42,22 +42,6 @@ #include #include "internal.h" -#if 0 -#define kenter(FMT, ...) \ - printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) -#else -#define kenter(FMT, ...) \ - no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) -#endif - void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; @@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to) for (; from < to; from += PAGE_SIZE) { struct page *page = virt_to_page(from); - kdebug("- free %lx", from); atomic_long_dec(&mmap_pages_allocated); - if (page_count(page) != 1) - kdebug("free page %p: refcount not one: %d", - page, page_count(page)); put_page(page); } } @@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, region->vm_usage); - BUG_ON(!nommu_region_tree.rb_node); if (--region->vm_usage == 0) { @@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region) /* IO memory and memory shared directly out of the pagecache * from ramfs/tmpfs mustn't be released here */ - if (region->vm_flags & VM_MAPPED_COPY) { - kdebug("free series"); + if (region->vm_flags & VM_MAPPED_COPY) free_page_series(region->vm_start, region->vm_top); - } kmem_cache_free(vm_region_jar, region); } else { up_write(&nommu_region_sem); @@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) struct address_space *mapping; struct rb_node **p, *parent, *rb_prev; - kenter(",%p", vma); - BUG_ON(!vma->vm_region); mm->map_count++; @@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - kenter("%p", vma); - protect_vma(vma, 0); mm->map_count--; @@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED) { - printk(KERN_DEBUG - "%d: Can't do fixed-address/overlay mmap of RAM\n", - current->pid); + if (flags & MAP_FIXED) return -EINVAL; - } if ((flags & MAP_TYPE) != MAP_PRIVATE && (flags & MAP_TYPE) != MAP_SHARED) @@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file, ) { capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { - printk(KERN_WARNING - "MAP_SHARED not completely supported on !MMU\n"); + pr_warn("MAP_SHARED not completely supported on !MMU\n"); return -EINVAL; } } @@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma, * we're allocating is smaller than a page */ order = get_order(len); - kdebug("alloc order %d for %lx", order, len); - total = 1 << order; point = len >> PAGE_SHIFT; /* we don't want to allocate a power-of-2 sized page set */ - if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) total = point; - kdebug("try to alloc exact %lu pages", total); - } base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); if (!base) @@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long capabilities, vm_flags, result; int ret; - kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - *populate = 0; /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) { - kleave(" = %d [val]", ret); + if (ret < 0) return ret; - } /* we ignore the address hint */ addr = 0; @@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_start = start; vma->vm_end = start + len; - if (pregion->vm_flags & VM_MAPPED_COPY) { - kdebug("share copy"); + if (pregion->vm_flags & VM_MAPPED_COPY) vma->vm_flags |= VM_MAPPED_COPY; - } else { - kdebug("share mmap"); + else { ret = do_mmap_shared_file(vma); if (ret < 0) { vma->vm_region = NULL; @@ -1467,7 +1423,6 @@ share: up_write(&nommu_region_sem); - kleave(" = %lx", result); return result; error_just_free: @@ -1479,27 +1434,24 @@ error: if (vma->vm_file) fput(vma->vm_file); kmem_cache_free(vm_area_cachep, vma); - kleave(" = %d", ret); return ret; sharing_violation: up_write(&nommu_region_sem); - printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; error_getting_vma: kmem_cache_free(vm_region_jar, region); - printk(KERN_WARNING "Allocation of vma for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; error_getting_region: - printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; } @@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_region *region; unsigned long npages; - kenter(""); - /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) @@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm, { struct vm_region *region; - kenter(""); - /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ delete_vma_from_mm(vma); @@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; int ret; - kenter(",%lx,%zx", start, len); - len = PAGE_ALIGN(len); if (len == 0) return -EINVAL; @@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (!vma) { static int limit; if (limit < 5) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d" - " (%s): 0x%lx-0x%lx\n", - current->pid, current->comm, - start, start + len - 1); + pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); limit++; } return -EINVAL; @@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* we're allowed to split an anonymous VMA but not a file-backed one */ if (vma->vm_file) { do { - if (start > vma->vm_start) { - kleave(" = -EINVAL [miss]"); + if (start > vma->vm_start) return -EINVAL; - } if (end == vma->vm_end) goto erase_whole_vma; vma = vma->vm_next; } while (vma); - kleave(" = -EINVAL [split file]"); return -EINVAL; } else { /* the chunk must be a subset of the VMA found */ if (start == vma->vm_start && end == vma->vm_end) goto erase_whole_vma; - if (start < vma->vm_start || end > vma->vm_end) { - kleave(" = -EINVAL [superset]"); + if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - } - if (start & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned start]"); + if (start & ~PAGE_MASK) return -EINVAL; - } - if (end != vma->vm_end && end & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned split]"); + if (end != vma->vm_end && end & ~PAGE_MASK) return -EINVAL; - } if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); - if (ret < 0) { - kleave(" = %d [split]", ret); + if (ret < 0) return ret; - } } return shrink_vma(mm, vma, start, end); } @@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) erase_whole_vma: delete_vma_from_mm(vma); delete_vma(mm, vma); - kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm) if (!mm) return; - kenter(""); - mm->total_vm = 0; while ((vma = mm->mmap)) { @@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } - - kleave(""); } unsigned long vm_brk(unsigned long addr, unsigned long len) -- cgit v1.2.3 From 1dd308a7b49d4bdbc17bfa570675ecc8cf7bedb3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Jun 2015 16:57:52 -0700 Subject: mm/hugetlb: document the reserve map/region tracking routines While working on hugetlbfs fallocate support, I noticed the following race in the existing code. It is unlikely that this race is hit very often in the current code. However, if more functionality to add and remove pages to hugetlbfs mappings (such as fallocate) is added the likelihood of hitting this race will increase. alloc_huge_page and hugetlb_reserve_pages use information from the reserve map to determine if there are enough available huge pages to complete the operation, as well as adjust global reserve and subpool usage counts. The order of operations is as follows: - call region_chg() to determine the expected change based on reserve map - determine if enough resources are available for this operation - adjust global counts based on the expected change - call region_add() to update the reserve map The issue is that reserve map could change between the call to region_chg and region_add. In this case, the counters which were adjusted based on the output of region_chg will not be correct. In order to hit this race today, there must be an existing shared hugetlb mmap created with the MAP_NORESERVE flag. A page fault to allocate a huge page via this mapping must occur at the same another task is mapping the same region without the MAP_NORESERVE flag. The patch set does not prevent the race from happening. Rather, it adds simple functionality to detect when the race has occurred. If a race is detected, then the incorrect counts are adjusted. Review comments pointed out the need for documentation of the existing region/reserve map routines. This patch set also adds documentation in this area. This patch (of 3): This is a documentation only patch and does not modify any code. Descriptions of the routines used for reserve map/region tracking are added. Signed-off-by: Mike Kravetz Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Rientjes Cc: Luiz Capitulino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 10de25cf1f99..4a1d7021efaf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -217,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are embedded into a resv_map and - * protected by a resv_map's lock + * The region data structures are embedded into a resv_map and protected + * by a resv_map's lock. The set of regions within the resv_map represent + * reservations for huge pages, or huge pages that have already been + * instantiated within the map. The from and to elements are huge page + * indicies into the associated mapping. from indicates the starting index + * of the region. to represents the first index past the end of the region. + * + * For example, a file region structure with from == 0 and to == 4 represents + * four huge pages in a mapping. It is important to note that the to element + * represents the first element past the end of the region. This is used in + * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. + * + * Interval notation of the form [from, to) will be used to indicate that + * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; @@ -226,6 +238,14 @@ struct file_region { long to; }; +/* + * Add the huge page range represented by [f, t) to the reserve + * map. Existing regions will be expanded to accommodate the + * specified range. We know only existing regions need to be + * expanded, because region_add is only called after region_chg + * with the same range. If a new file_region structure must + * be allocated, it is done in region_chg. + */ static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -265,6 +285,25 @@ static long region_add(struct resv_map *resv, long f, long t) return 0; } +/* + * Examine the existing reserve map and determine how many + * huge pages in the specified range [f, t) are NOT currently + * represented. This routine is called before a subsequent + * call to region_add that will actually modify the reserve + * map to add the specified range [f, t). region_chg does + * not change the number of huge pages represented by the + * map. However, if the existing regions in the map can not + * be expanded to represent the new range, a new file_region + * structure is added to the map as a placeholder. This is + * so that the subsequent region_add call will have all the + * regions it needs and will not fail. + * + * Returns the number of huge pages that need to be added + * to the existing reservation map for the range [f, t). + * This number is greater or equal to zero. -ENOMEM is + * returned if a new file_region structure is needed and can + * not be allocated. + */ static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -331,6 +370,11 @@ out_nrg: return chg; } +/* + * Truncate the reserve map at index 'end'. Modify/truncate any + * region which contains end. Delete any regions past end. + * Return the number of huge pages removed from the map. + */ static long region_truncate(struct resv_map *resv, long end) { struct list_head *head = &resv->regions; @@ -366,6 +410,10 @@ out: return chg; } +/* + * Count and return the number of huge pages in the reserve map + * that intersect with the range [f, t). + */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; -- cgit v1.2.3 From cf3ad20bfeadda693e408d85684790714fc29b08 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Jun 2015 16:57:55 -0700 Subject: mm/hugetlb: compute/return the number of regions added by region_add() Modify region_add() to keep track of regions(pages) added to the reserve map and return this value. The return value can be compared to the return value of region_chg() to determine if the map was modified between calls. Make vma_commit_reservation() also pass along the return value of region_add(). In the normal case, we want vma_commit_reservation to return the same value as the preceding call to vma_needs_reservation. Create a common __vma_reservation_common routine to help keep the special case return values in sync Signed-off-by: Mike Kravetz Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Rientjes Cc: Luiz Capitulino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 72 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4a1d7021efaf..cd3fc4194733 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -245,11 +245,15 @@ struct file_region { * expanded, because region_add is only called after region_chg * with the same range. If a new file_region structure must * be allocated, it is done in region_chg. + * + * Return the number of new huge pages added to the map. This + * number is greater than or equal to zero. */ static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + long add = 0; spin_lock(&resv->lock); /* Locate the region we are either in or before. */ @@ -275,14 +279,24 @@ static long region_add(struct resv_map *resv, long f, long t) if (rg->to > t) t = rg->to; if (rg != nrg) { + /* Decrement return value by the deleted range. + * Another range will span this area so that by + * end of routine add will be >= zero + */ + add -= (rg->to - rg->from); list_del(&rg->link); kfree(rg); } } + + add += (nrg->from - f); /* Added to beginning of region */ nrg->from = f; + add += t - nrg->to; /* Added to end of region */ nrg->to = t; + spin_unlock(&resv->lock); - return 0; + VM_BUG_ON(add < 0); + return add; } /* @@ -1470,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h, } /* - * Determine if the huge page at addr within the vma has an associated - * reservation. Where it does not we will need to logically increase - * reservation and actually increase subpool usage before an allocation - * can occur. Where any new reservation would be required the - * reservation change is prepared, but not committed. Once the page - * has been allocated from the subpool and instantiated the change should - * be committed via vma_commit_reservation. No action is required on - * failure. + * vma_needs_reservation and vma_commit_reservation are used by the huge + * page allocation routines to manage reservations. + * + * vma_needs_reservation is called to determine if the huge page at addr + * within the vma has an associated reservation. If a reservation is + * needed, the value 1 is returned. The caller is then responsible for + * managing the global reservation and subpool usage counts. After + * the huge page has been allocated, vma_commit_reservation is called + * to add the page to the reservation map. + * + * In the normal case, vma_commit_reservation returns the same value + * as the preceding vma_needs_reservation call. The only time this + * is not the case is if a reserve map was changed between calls. It + * is the responsibility of the caller to notice the difference and + * take appropriate action. */ -static long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr) +static long __vma_reservation_common(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, + bool commit) { struct resv_map *resv; pgoff_t idx; - long chg; + long ret; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); - chg = region_chg(resv, idx, idx + 1); + if (commit) + ret = region_add(resv, idx, idx + 1); + else + ret = region_chg(resv, idx, idx + 1); if (vma->vm_flags & VM_MAYSHARE) - return chg; + return ret; else - return chg < 0 ? chg : 0; + return ret < 0 ? ret : 0; } -static void vma_commit_reservation(struct hstate *h, + +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct resv_map *resv; - pgoff_t idx; - - resv = vma_resv_map(vma); - if (!resv) - return; + return __vma_reservation_common(h, vma, addr, false); +} - idx = vma_hugecache_offset(h, vma, addr); - region_add(resv, idx, idx + 1); +static long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, true); } static struct page *alloc_huge_page(struct vm_area_struct *vma, -- cgit v1.2.3 From 33039678c8da8133e30ea3250d10ae14701dae2b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Jun 2015 16:57:58 -0700 Subject: mm/hugetlb: handle races in alloc_huge_page and hugetlb_reserve_pages alloc_huge_page and hugetlb_reserve_pages use region_chg to calculate the number of pages which will be added to the reserve map. Subpool and global reserve counts are adjusted based on the output of region_chg. Before the pages are actually added to the reserve map, these routines could race and add fewer pages than expected. If this happens, the subpool and global reserve counts are not correct. Compare the number of pages actually added (region_add) to those expected to added (region_chg). If fewer pages are actually added, this indicates a race and adjust counters accordingly. Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Rientjes Cc: Luiz Capitulino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd3fc4194733..75c0eef52c5d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1542,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - long chg; + long chg, commit; int ret, idx; struct hugetlb_cgroup *h_cg; @@ -1583,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); - vma_commit_reservation(h, vma, addr); + commit = vma_commit_reservation(h, vma, addr); + if (unlikely(chg > commit)) { + /* + * The page was added to the reservation map between + * vma_needs_reservation and vma_commit_reservation. + * This indicates a race with hugetlb_reserve_pages. + * Adjust for the subpool count incremented above AND + * in hugetlb_reserve_pages for the same page. Also, + * the reservation count added in hugetlb_reserve_pages + * no longer applies. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + } return page; out_uncharge_cgroup: @@ -3701,8 +3716,24 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(resv_map, from, to); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + long add = region_add(resv_map, from, to); + + if (unlikely(chg > add)) { + /* + * pages in this range were added to the reserve + * map between region_chg and region_add. This + * indicates a race with alloc_huge_page. Adjust + * the subpool and reserve counts modified above + * based on the difference. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, + chg - add); + hugetlb_acct_memory(h, -rsv_adjust); + } + } return 0; out_err: if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) -- cgit v1.2.3 From f0d6647e85050c6ca70d69a647e3c653dd9b349a Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 24 Jun 2015 16:58:01 -0700 Subject: mm/oom_kill.c: print points as unsigned int In oom_kill_process(), the variable 'points' is unsigned int. Print it as such. Signed-off-by: Wang Long Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5cfda39b3268..dff991e0681e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -517,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, dump_header(p, gfp_mask, order, memcg, nodemask); task_lock(p); - pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); task_unlock(p); -- cgit v1.2.3 From 0f96ae2928a547b86678688042a9759edcc8285d Mon Sep 17 00:00:00 2001 From: Shailendra Verma Date: Wed, 24 Jun 2015 16:58:03 -0700 Subject: mm/cma.c: fix typos in comments Signed-off-by: Shailendra Verma Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 3a7a67b93394..661278025c46 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment requied by mm core */ + /* ensure minimal alignment required by mm core */ alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); /* alignment should be aligned with order_per_bit */ @@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base, /* * high_memory isn't direct mapped memory so retrieving its physical * address isn't appropriate. But it would be useful to check the - * physical address of the highmem boundary so it's justfiable to get + * physical address of the highmem boundary so it's justifiable to get * the physical address from it. On x86 there is a validation check for * this case, so the following workaround is needed to avoid it. */ -- cgit v1.2.3 From 6afdb859b71019143b8eecda02b8b29b03185055 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Jun 2015 16:58:06 -0700 Subject: mm: do not ignore mapping_gfp_mask in page cache allocation paths page_cache_read, do_generic_file_read, __generic_file_splice_read and __ntfs_grab_cache_pages currently ignore mapping_gfp_mask when calling add_to_page_cache_lru which might cause recursion into fs down in the direct reclaim path if the mapping really relies on GFP_NOFS semantic. This doesn't seem to be the case now because page_cache_read (page fault path) doesn't seem to suffer from the reclaim recursion issues and do_generic_file_read and __generic_file_splice_read also shouldn't be called under fs locks which would deadlock in the reclaim path. Anyway it is better to obey mapping gfp mask and prevent from later breakage. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Cc: Dave Chinner Cc: Neil Brown Cc: Johannes Weiner Cc: Al Viro Cc: Mel Gorman Cc: Rik van Riel Cc: Tetsuo Handa Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 519af0045b1c..8d17ceea8dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1664,8 +1664,8 @@ no_cached_page: error = -ENOMEM; goto out; } - error = add_to_page_cache_lru(page, mapping, - index, GFP_KERNEL); + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1766,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) -- cgit v1.2.3 From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:09 -0700 Subject: mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute Some high end Intel Xeon systems report uncorrectable memory errors as a recoverable machine check. Linux has included code for some time to process these and just signal the affected processes (or even recover completely if the error was in a read only page that can be replaced by reading from disk). But we have no recovery path for errors encountered during kernel code execution. Except for some very specific cases were are unlikely to ever be able to recover. Enter memory mirroring. Actually 3rd generation of memory mirroing. Gen1: All memory is mirrored Pro: No s/w enabling - h/w just gets good data from other side of the mirror Con: Halves effective memory capacity available to OS/applications Gen2: Partial memory mirror - just mirror memory begind some memory controllers Pro: Keep more of the capacity Con: Nightmare to enable. Have to choose between allocating from mirrored memory for safety vs. NUMA local memory for performance Gen3: Address range partial memory mirror - some mirror on each memory controller Pro: Can tune the amount of mirror and keep NUMA performance Con: I have to write memory management code to implement The current plan is just to use mirrored memory for kernel allocations. This has been broken into two phases: 1) This patch series - find the mirrored memory, use it for boot time allocations 2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the unused mirrored memory from mm/memblock.c and only give it out to select kernel allocations (this is still being scoped because page_alloc.c is scary). This patch (of 3): Add extra "flags" to memblock to allow selection of memory based on attribute. No functional changes Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 6 ++++-- mm/memblock.c | 55 +++++++++++++++++++++++++++++++++++-------------------- mm/memtest.c | 3 ++- mm/nobootmem.c | 6 ++++-- 4 files changed, 45 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 661278025c46..e7d1db533025 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range(size, alignment, - highmem_start, limit); + highmem_start, limit, + MEMBLOCK_NONE); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range(size, alignment, base, - limit); + limit, + MEMBLOCK_NONE); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/memblock.c b/mm/memblock.c index 9318b567ed79..b9ff2f4f0285 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -107,6 +107,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * @@ -115,12 +116,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -139,6 +141,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -147,12 +150,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, + NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -174,6 +179,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Find @size free area aligned to @align in the specified range and node. * @@ -190,7 +196,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t kernel_end, ret; @@ -215,7 +221,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /* ok, try bottom-up allocation first */ ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid); + size, align, nid, flags); if (ret) return ret; @@ -233,7 +239,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, "memory hotunplug may be affected\n"); } - return __memblock_find_range_top_down(start, end, size, align, nid); + return __memblock_find_range_top_down(start, end, size, align, nid, + flags); } /** @@ -253,7 +260,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { return memblock_find_in_range_node(size, align, start, end, - NUMA_NO_NODE); + NUMA_NO_NODE, MEMBLOCK_NONE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -782,6 +789,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -803,7 +811,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_mem_range(u64 *idx, int nid, +void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -895,6 +903,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * * @idx: pointer to u64 loop variable * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -903,7 +912,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * * Reverse of __next_mem_range(). */ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -1050,14 +1059,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, start, end, nid); + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); if (found && !memblock_reserve(found, size)) { /* * The min_count is set to 0 so that memblock allocations are @@ -1070,26 +1080,30 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, } phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end) + phys_addr_t start, phys_addr_t end, + ulong flags) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + flags); } static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid) + int nid, ulong flags) { - return memblock_alloc_range_nid(size, align, 0, max_addr, nid); + return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, MEMBLOCK_NONE); } phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, + MEMBLOCK_NONE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -1173,13 +1187,14 @@ static void * __init memblock_virt_alloc_internal( again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid); + nid, MEMBLOCK_NONE); if (alloc) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, - max_addr, NUMA_NO_NODE); + max_addr, NUMA_NO_NODE, + MEMBLOCK_NONE); if (alloc) goto done; } diff --git a/mm/memtest.c b/mm/memtest.c index 1997d934b13b..0a1cc133f6d7 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) u64 i; phys_addr_t this_start, this_end; - for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start, + &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); if (this_start < this_end) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 90b50468333e..ad3641dcdbe7 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,7 +41,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(size, align, goal, limit, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid, + MEMBLOCK_NONE); if (!addr) return NULL; @@ -121,7 +122,8 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) count += __free_memory_core(start, end); #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -- cgit v1.2.3 From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Jun 2015 16:58:12 -0700 Subject: mm/memblock: allocate boot time data structures from mirrored memory Try to allocate all boot time kernel data structures from mirrored memory. If we run out of mirrored memory print warnings, but fall back to using non-mirrored memory to make sure that we still boot. By number of bytes, most of what we allocate at boot time is the page structures. 64 bytes per 4K page on x86_64 ... or about 1.5% of total system memory. For workloads where the bulk of memory is allocated to applications this may represent a useful improvement to system availability since 1.5% of total memory might be a third of the memory allocated to the kernel. Signed-off-by: Tony Luck Cc: Xishi Qiu Cc: Hanjun Guo Cc: Xiexiuqi Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--------- mm/nobootmem.c | 10 +++++++- 2 files changed, 76 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index b9ff2f4f0285..1b444c730846 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock; #ifdef CONFIG_MOVABLE_NODE bool movable_node_enabled __initdata_memblock = false; #endif +static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; +ulong __init_memblock choose_memblock_flags(void) +{ + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; +} + /* inline so we don't get a warning when pr_debug is compiled out */ static __init_memblock const char * memblock_type_name(struct memblock_type *type) @@ -259,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(size, align, start, end, - NUMA_NO_NODE, MEMBLOCK_NONE); + phys_addr_t ret; + ulong flags = choose_memblock_flags(); + +again: + ret = memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + + return ret; } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -785,6 +804,21 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); } +/** + * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) +{ + system_has_some_mirror = true; + + return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); +} + + /** * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable @@ -839,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -944,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1096,8 +1138,18 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, - nid, MEMBLOCK_NONE); + ulong flags = choose_memblock_flags(); + phys_addr_t ret; + +again: + ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + return ret; } phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -1167,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; + ulong flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1187,14 +1240,14 @@ static void * __init memblock_virt_alloc_internal( again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid, MEMBLOCK_NONE); + nid, flags); if (alloc) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, NUMA_NO_NODE, - MEMBLOCK_NONE); + flags); if (alloc) goto done; } @@ -1202,10 +1255,16 @@ again: if (min_addr) { min_addr = 0; goto again; - } else { - goto error; } + if (flags & MEMBLOCK_MIRROR) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } + + return NULL; done: memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); @@ -1220,9 +1279,6 @@ done: kmemleak_alloc(ptr, size, 0, 0); return ptr; - -error: - return NULL; } /** diff --git a/mm/nobootmem.c b/mm/nobootmem.c index ad3641dcdbe7..5258386fa1be 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -37,12 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; + ulong flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; +again: addr = memblock_find_in_range_node(size, align, goal, limit, nid, - MEMBLOCK_NONE); + flags); + if (!addr && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } if (!addr) return NULL; -- cgit v1.2.3 From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 24 Jun 2015 16:58:18 -0700 Subject: frontswap: allow multiple backends Change frontswap single pointer to a singly linked list of frontswap implementations. Update Xen tmem implementation as register no longer returns anything. Frontswap only keeps track of a single implementation; any implementation that registers second (or later) will replace the previously registered implementation, and gets a pointer to the previous implementation that the new implementation is expected to pass all frontswap functions to if it can't handle the function itself. However that method doesn't really make much sense, as passing that work on to every implementation adds unnecessary work to implementations; instead, frontswap should simply keep a list of all registered implementations and try each implementation for any function. Most importantly, neither of the two currently existing frontswap implementations in the kernel actually do anything with any previous frontswap implementation that they replace when registering. This allows frontswap to successfully manage multiple implementations by keeping a list of them all. Signed-off-by: Dan Streetman Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 215 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 84 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 8d82809eb085..27a9924caf61 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -21,11 +21,16 @@ #include /* - * frontswap_ops is set by frontswap_register_ops to contain the pointers - * to the frontswap "backend" implementation functions. + * frontswap_ops are added by frontswap_register_ops, and provide the + * frontswap "backend" implementation functions. Multiple implementations + * may be registered, but implementations can never deregister. This + * is a simple singly-linked list of all registered implementations. */ static struct frontswap_ops *frontswap_ops __read_mostly; +#define for_each_frontswap_ops(ops) \ + for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) + /* * If enabled, frontswap_store will return failure even on success. As * a result, the swap subsystem will always write the page to swap, in @@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { } * on all frontswap functions to not call the backend until the backend * has registered. * - * Specifically when no backend is registered (nobody called - * frontswap_register_ops) all calls to frontswap_init (which is done via - * swapon -> enable_swap_info -> frontswap_init) are registered and remembered - * (via the setting of need_init bitmap) but fail to create tmem_pools. When a - * backend registers with frontswap at some later point the previous - * calls to frontswap_init are executed (by iterating over the need_init - * bitmap) to create tmem_pools and set the respective poolids. All of that is - * guarded by us using atomic bit operations on the 'need_init' bitmap. - * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). * Fortunatly for us, the swapon_mutex has been taked by the callee so we are @@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { } * * Obviously the opposite (unloading the backend) must be done after all * the frontswap_[store|load|invalidate_area|invalidate_page] start - * ignorning or failing the requests - at which point frontswap_ops - * would have to be made in some fashion atomic. + * ignoring or failing the requests. However, there is currently no way + * to unload a backend once it is registered. */ -static DECLARE_BITMAP(need_init, MAX_SWAPFILES); /* - * Register operations for frontswap, returning previous thus allowing - * detection of multiple backends and possible nesting. + * Register operations for frontswap */ -struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) +void frontswap_register_ops(struct frontswap_ops *ops) { - struct frontswap_ops *old = frontswap_ops; - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) { - if (test_and_clear_bit(i, need_init)) { - struct swap_info_struct *sis = swap_info[i]; - /* __frontswap_init _should_ have set it! */ - if (!sis->frontswap_map) - return ERR_PTR(-EINVAL); - ops->init(i); - } + DECLARE_BITMAP(a, MAX_SWAPFILES); + DECLARE_BITMAP(b, MAX_SWAPFILES); + struct swap_info_struct *si; + unsigned int i; + + bitmap_zero(a, MAX_SWAPFILES); + bitmap_zero(b, MAX_SWAPFILES); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (!WARN_ON(!si->frontswap_map)) + set_bit(si->type, a); } + spin_unlock(&swap_lock); + + /* the new ops needs to know the currently active swap devices */ + for_each_set_bit(i, a, MAX_SWAPFILES) + ops->init(i); + /* - * We MUST have frontswap_ops set _after_ the frontswap_init's - * have been called. Otherwise __frontswap_store might fail. Hence - * the barrier to make sure compiler does not re-order us. + * Setting frontswap_ops must happen after the ops->init() calls + * above; cmpxchg implies smp_mb() which will ensure the init is + * complete at this point. */ - barrier(); - frontswap_ops = ops; - return old; + do { + ops->next = frontswap_ops; + } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (si->frontswap_map) + set_bit(si->type, b); + } + spin_unlock(&swap_lock); + + /* + * On the very unlikely chance that a swap device was added or + * removed between setting the "a" list bits and the ops init + * calls, we re-check and do init or invalidate for any changed + * bits. + */ + if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!test_bit(i, a) && test_bit(i, b)) + ops->init(i); + else if (test_bit(i, a) && !test_bit(i, b)) + ops->invalidate_area(i); + } + } } EXPORT_SYMBOL(frontswap_register_ops); @@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); void __frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; BUG_ON(sis == NULL); @@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); - if (frontswap_ops) - frontswap_ops->init(type); - else { - BUG_ON(type >= MAX_SWAPFILES); - set_bit(type, need_init); - } + + for_each_frontswap_ops(ops) + ops->init(type); } EXPORT_SYMBOL(__frontswap_init); bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { - bool ret = false; - - if (frontswap_ops && sis->frontswap_map) - ret = test_bit(offset, sis->frontswap_map); - return ret; + if (sis->frontswap_map) + return test_bit(offset, sis->frontswap_map); + return false; } EXPORT_SYMBOL(__frontswap_test); +static inline void __frontswap_set(struct swap_info_struct *sis, + pgoff_t offset) +{ + set_bit(offset, sis->frontswap_map); + atomic_inc(&sis->frontswap_pages); +} + static inline void __frontswap_clear(struct swap_info_struct *sis, - pgoff_t offset) + pgoff_t offset) { clear_bit(offset, sis->frontswap_map); atomic_dec(&sis->frontswap_pages); @@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, */ int __frontswap_store(struct page *page) { - int ret = -1, dup = 0; + int ret = -1; swp_entry_t entry = { .val = page_private(page), }; int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; /* * Return if no backend registed. * Don't need to inc frontswap_failed_stores here. */ if (!frontswap_ops) - return ret; + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (__frontswap_test(sis, offset)) - dup = 1; - ret = frontswap_ops->store(type, offset, page); + + /* + * If a dup, we must remove the old page first; we can't leave the + * old page no matter if the store of the new page succeeds or fails, + * and we can't rely on the new page replacing the old page as we may + * not store to the same implementation that contains the old page. + */ + if (__frontswap_test(sis, offset)) { + __frontswap_clear(sis, offset); + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + } + + /* Try to store in each implementation, until one succeeds. */ + for_each_frontswap_ops(ops) { + ret = ops->store(type, offset, page); + if (!ret) /* successful store */ + break; + } if (ret == 0) { - set_bit(offset, sis->frontswap_map); + __frontswap_set(sis, offset); inc_frontswap_succ_stores(); - if (!dup) - atomic_inc(&sis->frontswap_pages); } else { - /* - failed dup always results in automatic invalidate of - the (older) page from frontswap - */ inc_frontswap_failed_stores(); - if (dup) { - __frontswap_clear(sis, offset); - frontswap_ops->invalidate_page(type, offset); - } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ @@ -268,14 +301,22 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; + + if (!frontswap_ops) + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) - ret = frontswap_ops->load(type, offset, page); + if (!__frontswap_test(sis, offset)) + return -1; + + /* Try loading from each implementation, until one succeeds. */ + for_each_frontswap_ops(ops) { + ret = ops->load(type, offset, page); + if (!ret) /* successful load */ + break; + } if (ret == 0) { inc_frontswap_loads(); if (frontswap_tmem_exclusive_gets_enabled) { @@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load); void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; + + if (!frontswap_ops) + return; BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) { - frontswap_ops->invalidate_page(type, offset); - __frontswap_clear(sis, offset); - inc_frontswap_invalidates(); - } + if (!__frontswap_test(sis, offset)) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); } EXPORT_SYMBOL(__frontswap_invalidate_page); @@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page); void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; - if (frontswap_ops) { - BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) - return; - frontswap_ops->invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - bitmap_zero(sis->frontswap_map, sis->max); - } - clear_bit(type, need_init); + if (!frontswap_ops) + return; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); } EXPORT_SYMBOL(__frontswap_invalidate_area); -- cgit v1.2.3 From f4b90b70b7a4f5c29c442399ffd531332356e1f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2015 16:58:21 -0700 Subject: memcg: remove unused mem_cgroup->oom_wakeups Since commit 4942642080ea ("mm: memcg: handle non-error OOM situations more gracefully"), nobody uses mem_cgroup->oom_wakeups. Remove it. While at it, also fold memcg_wakeup_oom() into memcg_oom_recover() which is its only user. This cleanup was suggested by Michal. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8da44a083397..6a5f5d59f7d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -287,7 +287,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -1850,17 +1849,10 @@ static int memcg_oom_wake_function(wait_queue_t *wait, return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->oom_wakeups); - /* for filtering, pass "memcg" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); -} - static void memcg_oom_recover(struct mem_cgroup *memcg) { if (memcg && atomic_read(&memcg->under_oom)) - memcg_wakeup_oom(memcg); + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) -- cgit v1.2.3 From c2b42d3cadbffbf5117ccdbcb3a2fc47c0d59bae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2015 16:58:23 -0700 Subject: memcg: convert mem_cgroup->under_oom from atomic_t to int memcg->under_oom tracks whether the memcg is under OOM conditions and is an atomic_t counter managed with mem_cgroup_[un]mark_under_oom(). While atomic_t appears to be simple synchronization-wise, when used as a synchronization construct like here, it's trickier and more error-prone due to weak memory ordering rules, especially around atomic_read(), and false sense of security. For example, both non-trivial read sites of memcg->under_oom are a bit problematic although not being actually broken. * mem_cgroup_oom_register_event() It isn't explicit what guarantees the memory ordering between event addition and memcg->under_oom check. This isn't broken only because memcg_oom_lock is used for both event list and memcg->oom_lock. * memcg_oom_recover() The lockless test doesn't have any explanation why this would be safe. mem_cgroup_[un]mark_under_oom() are very cold paths and there's no point in avoiding locking memcg_oom_lock there. This patch converts memcg->under_oom from atomic_t to int, puts their modifications under memcg_oom_lock and documents why the lockless test in memcg_oom_recover() is safe. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a5f5d59f7d7..e65f7b0131d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -285,8 +285,9 @@ struct mem_cgroup { */ bool use_hierarchy; + /* protected by memcg_oom_lock */ bool oom_lock; - atomic_t under_oom; + int under_oom; int swappiness; /* OOM-Killer disable */ @@ -1809,8 +1810,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_inc(&iter->under_oom); + iter->under_oom++; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) @@ -1819,11 +1822,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) /* * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. We have to use - * atomic_add_unless() here. + * mem_cgroup_oom_lock() may not be called. Watch for underflow. */ + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_add_unless(&iter->under_oom, -1, 0); + if (iter->under_oom > 0) + iter->under_oom--; + spin_unlock(&memcg_oom_lock); } static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1851,7 +1856,15 @@ static int memcg_oom_wake_function(wait_queue_t *wait, static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (memcg && atomic_read(&memcg->under_oom)) + /* + * For the following lockless ->under_oom test, the only required + * guarantee is that it must see the state asserted by an OOM when + * this function is called as a result of userland actions + * triggered by the notification of the OOM. This is trivially + * achieved by invoking mem_cgroup_mark_under_oom() before + * triggering notification. + */ + if (memcg && memcg->under_oom) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } @@ -3860,7 +3873,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? */ - if (atomic_read(&memcg->under_oom)) + if (memcg->under_oom) eventfd_signal(eventfd, 1); spin_unlock(&memcg_oom_lock); @@ -3889,7 +3902,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); - seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); return 0; } -- cgit v1.2.3 From c5f3b1a51a591c18c8b33983908e7fdda6ae417e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 24 Jun 2015 16:58:26 -0700 Subject: mm: kmemleak: allow safe memory scanning during kmemleak disabling The kmemleak scanning thread can run for minutes. Callbacks like kmemleak_free() are allowed during this time, the race being taken care of by the object->lock spinlock. Such lock also prevents a memory block from being freed or unmapped while it is being scanned by blocking the kmemleak_free() -> ... -> __delete_object() function until the lock is released in scan_object(). When a kmemleak error occurs (e.g. it fails to allocate its metadata), kmemleak_enabled is set and __delete_object() is no longer called on freed objects. If kmemleak_scan is running at the same time, kmemleak_free() no longer waits for the object scanning to complete, allowing the corresponding memory block to be freed or unmapped (in the case of vfree()). This leads to kmemleak_scan potentially triggering a page fault. This patch separates the kmemleak_free() enabling/disabling from the overall kmemleak_enabled nob so that we can defer the disabling of the object freeing tracking until the scanning thread completed. The kmemleak_free_part() is deliberately ignored by this patch since this is only called during boot before the scanning thread started. Signed-off-by: Catalin Marinas Reported-by: Vignesh Radhakrishnan Tested-by: Vignesh Radhakrishnan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f0fe4f2c1fa7..41df5b8efd25 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -195,6 +195,8 @@ static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ static int kmemleak_enabled; +/* same as above but only for the kmemleak_free() callback */ +static int kmemleak_free_enabled; /* set in the late_initcall if there were no errors */ static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ @@ -942,7 +944,7 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); @@ -982,7 +984,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); @@ -1750,6 +1752,13 @@ static void kmemleak_do_cleanup(struct work_struct *work) mutex_lock(&scan_mutex); stop_scan_thread(); + /* + * Once the scan thread has stopped, it is safe to no longer track + * object freeing. Ordering of the scan thread stopping and the memory + * accesses below is guaranteed by the kthread_stop() function. + */ + kmemleak_free_enabled = 0; + if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else @@ -1776,6 +1785,8 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (kmemleak_initialized) schedule_work(&cleanup_work); + else + kmemleak_free_enabled = 0; pr_info("Kernel memory leak detector disabled\n"); } @@ -1840,8 +1851,10 @@ void __init kmemleak_init(void) if (kmemleak_error) { local_irq_restore(flags); return; - } else + } else { kmemleak_enabled = 1; + kmemleak_free_enabled = 1; + } local_irq_restore(flags); /* -- cgit v1.2.3 From e781a9ab4847a81224667f5faab0b2bc5f78908f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 24 Jun 2015 16:58:29 -0700 Subject: mm: kmemleak: fix delete_object_*() race when called on the same memory block Calling delete_object_*() on the same pointer is not a standard use case (unless there is a bug in the code calling kmemleak_free()). However, during kmemleak disabling (error or user triggered via /sys), there is a potential race between kmemleak_free() calls on a CPU and __kmemleak_do_cleanup() on a different CPU. The current delete_object_*() implementation first performs a look-up holding kmemleak_lock, increments the object->use_count and then re-acquires kmemleak_lock to remove the object from object_tree_root and object_list. This patch simplifies the delete_object_*() mechanism to both look up and remove an object from the object_tree_root and object_list atomically (guarded by kmemleak_lock). This allows safe concurrent calls to delete_object_*() on the same pointer without additional locking for synchronising the kmemleak_free_enabled flag. A side effect is a slight improvement in the delete_object_*() performance by avoiding acquiring kmemleak_lock twice and incrementing/decrementing object->use_count. Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 41df5b8efd25..ecde522ff616 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -497,6 +497,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) return object; } +/* + * Look up an object in the object search tree and remove it from both + * object_tree_root and object_list. The returned object's use_count should be + * at least 1, as initially set by create_object(). + */ +static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object; + + write_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, alias); + if (object) { + rb_erase(&object->rb_node, &object_tree_root); + list_del_rcu(&object->object_list); + } + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + /* * Save stack trace to the given array of MAX_TRACE size. */ @@ -600,20 +621,14 @@ out: } /* - * Remove the metadata (struct kmemleak_object) for a memory block from the - * object_list and object_tree_root and decrement its use_count. + * Mark the object as not allocated and schedule RCU freeing via put_object(). */ static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - write_lock_irqsave(&kmemleak_lock, flags); - rb_erase(&object->rb_node, &object_tree_root); - list_del_rcu(&object->object_list); - write_unlock_irqrestore(&kmemleak_lock, flags); - WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 2); + WARN_ON(atomic_read(&object->use_count) < 1); /* * Locking here also ensures that the corresponding memory block @@ -633,7 +648,7 @@ static void delete_object_full(unsigned long ptr) { struct kmemleak_object *object; - object = find_and_get_object(ptr, 0); + object = find_and_remove_object(ptr, 0); if (!object) { #ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", @@ -642,7 +657,6 @@ static void delete_object_full(unsigned long ptr) return; } __delete_object(object); - put_object(object); } /* @@ -655,7 +669,7 @@ static void delete_object_part(unsigned long ptr, size_t size) struct kmemleak_object *object; unsigned long start, end; - object = find_and_get_object(ptr, 1); + object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx " @@ -663,7 +677,6 @@ static void delete_object_part(unsigned long ptr, size_t size) #endif return; } - __delete_object(object); /* * Create one or two objects that may result from the memory block @@ -681,7 +694,7 @@ static void delete_object_part(unsigned long ptr, size_t size) create_object(ptr + size, end - ptr - size, object->min_count, GFP_KERNEL); - put_object(object); + __delete_object(object); } static void __paint_it(struct kmemleak_object *object, int color) -- cgit v1.2.3 From 5f369f374ba4889fe3c17883402db5ee8d254216 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 24 Jun 2015 16:58:31 -0700 Subject: mm: kmemleak: do not acquire scan_mutex in kmemleak_do_cleanup() The kmemleak_do_cleanup() work thread already waits for the kmemleak_scan thread to finish via kthread_stop(). Waiting in kthread_stop() while scan_mutex is held may lead to deadlock if kmemleak_scan_thread() also waits to acquire for scan_mutex. Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index ecde522ff616..8a57e34625fa 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1762,7 +1762,6 @@ static void __kmemleak_do_cleanup(void) */ static void kmemleak_do_cleanup(struct work_struct *work) { - mutex_lock(&scan_mutex); stop_scan_thread(); /* @@ -1777,7 +1776,6 @@ static void kmemleak_do_cleanup(struct work_struct *work) else pr_info("Kmemleak disabled without freeing internal data. " "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); - mutex_unlock(&scan_mutex); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); -- cgit v1.2.3 From 9d5a4c730dd164f6f1b4ed6690fbe2667e5149ea Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 24 Jun 2015 16:58:34 -0700 Subject: mm: kmemleak: avoid deadlock on the kmemleak object insertion error path While very unlikely (usually kmemleak or sl*b bug), the create_object() function in mm/kmemleak.c may fail to insert a newly allocated object into the rb tree. When this happens, kmemleak disables itself and prints additional information about the object already found in the rb tree. Such printing is done with the parent->lock acquired, however the kmemleak_lock is already held. This is a potential race with the scanning thread which acquires object->lock and kmemleak_lock in a This patch removes the locking around the 'parent' object information printing. Such object cannot be freed or removed from object_tree_root and object_list since kmemleak_lock is already held. There is a very small risk that some of the object data is being modified on another CPU but the only downside is inconsistent information printing. Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8a57e34625fa..c0fd7769d227 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -53,6 +53,11 @@ * modifications to the memory scanning parameters including the scan_thread * pointer * + * Locks and mutexes should only be acquired/nested in the following order: + * + * scan_mutex -> object->lock -> other_object->lock (SINGLE_DEPTH_NESTING) + * -> kmemleak_lock + * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes * 0, this count can no longer be incremented and put_object() schedules the @@ -603,11 +608,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, kmemleak_stop("Cannot insert 0x%lx into the object " "search tree (overlaps existing)\n", ptr); + /* + * No need for parent->lock here since "parent" cannot + * be freed while the kmemleak_lock is held. + */ + dump_object_info(parent); kmem_cache_free(object_cache, object); - object = parent; - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); + object = NULL; goto out; } } -- cgit v1.2.3 From 93ada579b0eea06f808aef08ead64bb230fb7851 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 24 Jun 2015 16:58:37 -0700 Subject: mm: kmemleak: optimise kmemleak_lock acquiring during kmemleak_scan The kmemleak memory scanning uses finer grained object->lock spinlocks primarily to avoid races with the memory block freeing. However, the pointer lookup in the rb tree requires the kmemleak_lock to be held. This is currently done in the find_and_get_object() function for each pointer-like location read during scanning. While this allows a low latency on kmemleak_*() callbacks on other CPUs, the memory scanning is slower. This patch moves the kmemleak_lock outside the scan_block() loop, acquiring/releasing it only once per scanned memory block. The allow_resched logic is moved outside scan_block() and a new scan_large_block() function is implemented which splits large blocks in MAX_SCAN_SIZE chunks with cond_resched() calls in-between. A redundant (object->flags & OBJECT_NO_SCAN) check is also removed from scan_object(). With this patch, the kmemleak scanning performance is significantly improved: at least 50% with lock debugging disabled and over an order of magnitude with lock proving enabled (on an arm64 system). Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 90 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c0fd7769d227..ca9e5a5969a8 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -53,10 +53,12 @@ * modifications to the memory scanning parameters including the scan_thread * pointer * - * Locks and mutexes should only be acquired/nested in the following order: + * Locks and mutexes are acquired/nested in the following order: * - * scan_mutex -> object->lock -> other_object->lock (SINGLE_DEPTH_NESTING) - * -> kmemleak_lock + * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING) + * + * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex + * regions. * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes @@ -490,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); - if (ptr >= min_addr && ptr < max_addr) - object = lookup_object(ptr, alias); + object = lookup_object(ptr, alias); read_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ @@ -1170,19 +1171,18 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned, int allow_resched) + struct kmemleak_object *scanned) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); + unsigned long flags; + read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; - unsigned long flags; unsigned long pointer; - if (allow_resched) - cond_resched(); if (scan_should_stop()) break; @@ -1195,26 +1195,31 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - object = find_and_get_object(pointer, 1); + if (pointer < min_addr || pointer >= max_addr) + continue; + + /* + * No need for get_object() here since we hold kmemleak_lock. + * object->use_count cannot be dropped to 0 while the object + * is still present in object_tree_root and object_list + * (with updates protected by kmemleak_lock). + */ + object = lookup_object(pointer, 1); if (!object) continue; - if (object == scanned) { + if (object == scanned) /* self referenced, ignore */ - put_object(object); continue; - } /* * Avoid the lockdep recursive warning on object->lock being * previously acquired in scan_object(). These locks are * enclosed by scan_mutex. */ - spin_lock_irqsave_nested(&object->lock, flags, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); if (!color_white(object)) { /* non-orphan, ignored or new */ - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); + spin_unlock(&object->lock); continue; } @@ -1226,13 +1231,27 @@ static void scan_block(void *_start, void *_end, */ object->count++; if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); list_add_tail(&object->gray_list, &gray_list); - spin_unlock_irqrestore(&object->lock, flags); - continue; } + spin_unlock(&object->lock); + } + read_unlock_irqrestore(&kmemleak_lock, flags); +} - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); +/* + * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. + */ +static void scan_large_block(void *start, void *end) +{ + void *next; + + while (start < end) { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, NULL); + start = next; + cond_resched(); } } @@ -1258,22 +1277,25 @@ static void scan_object(struct kmemleak_object *object) if (hlist_empty(&object->area_list)) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); + void *next; + + do { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, object); - while (start < end && (object->flags & OBJECT_ALLOCATED) && - !(object->flags & OBJECT_NO_SCAN)) { - scan_block(start, min(start + MAX_SCAN_SIZE, end), - object, 0); - start += MAX_SCAN_SIZE; + start = next; + if (start >= end) + break; spin_unlock_irqrestore(&object->lock, flags); cond_resched(); spin_lock_irqsave(&object->lock, flags); - } + } while (object->flags & OBJECT_ALLOCATED); } else hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), - object, 0); + object); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -1350,14 +1372,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL, 1); - scan_block(__bss_start, __bss_stop, NULL, 1); + scan_large_block(_sdata, _edata); + scan_large_block(__bss_start, __bss_stop); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) - scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL, 1); + scan_large_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i)); #endif /* @@ -1378,7 +1400,7 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL, 1); + scan_block(page, page + 1, NULL); } } put_online_mems(); @@ -1392,7 +1414,7 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL, 0); + THREAD_SIZE, NULL); } while_each_thread(g, p); read_unlock(&tasklist_lock); } -- cgit v1.2.3 From e37609bb36f706c954e82e580e2e790e9d5caef8 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Wed, 24 Jun 2015 16:58:39 -0700 Subject: mm/mmap.c: optimization of do_mmap_pgoff function The simple check for zero length memory mapping may be performed earlier. So that in case of zero length memory mapping some unnecessary code is not executed at all. It does not make the code less readable and saves some CPU cycles. Signed-off-by: Piotr Kwapulinski Acked-by: Michal Hocko Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bb50cacc3ea5..aa632ade2be7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, *populate = 0; + if (!len) + return -EINVAL; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) prot |= PROT_EXEC; - if (!len) - return -EINVAL; - if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); -- cgit v1.2.3 From c435a390574d012f8d30074135d8fcc6f480b484 Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Wed, 24 Jun 2015 16:58:42 -0700 Subject: mm/memory hotplug: print the last vmemmap region at the end of hot add memory When hot add two nodes continuously, we found the vmemmap region info is a bit messed. The last region of node 2 is printed when node 3 hot added, like the following: Initmem setup node 2 [mem 0x0000000000000000-0xffffffffffffffff] On node 2 totalpages: 0 Built 2 zonelists in Node order, mobility grouping on. Total pages: 16090539 Policy zone: Normal init_memory_mapping: [mem 0x40000000000-0x407ffffffff] [mem 0x40000000000-0x407ffffffff] page 1G [ffffea1000000000-ffffea10001fffff] PMD -> [ffff8a077d800000-ffff8a077d9fffff] on node 2 [ffffea1000200000-ffffea10003fffff] PMD -> [ffff8a077de00000-ffff8a077dffffff] on node 2 ... [ffffea101f600000-ffffea101f9fffff] PMD -> [ffff8a074ac00000-ffff8a074affffff] on node 2 [ffffea101fa00000-ffffea101fdfffff] PMD -> [ffff8a074a800000-ffff8a074abfffff] on node 2 Initmem setup node 3 [mem 0x0000000000000000-0xffffffffffffffff] On node 3 totalpages: 0 Built 3 zonelists in Node order, mobility grouping on. Total pages: 16090539 Policy zone: Normal init_memory_mapping: [mem 0x60000000000-0x607ffffffff] [mem 0x60000000000-0x607ffffffff] page 1G [ffffea101fe00000-ffffea101fffffff] PMD -> [ffff8a074a400000-ffff8a074a5fffff] on node 2 <=== node 2 ??? [ffffea1800000000-ffffea18001fffff] PMD -> [ffff8a074a600000-ffff8a074a7fffff] on node 3 [ffffea1800200000-ffffea18005fffff] PMD -> [ffff8a074a000000-ffff8a074a3fffff] on node 3 [ffffea1800600000-ffffea18009fffff] PMD -> [ffff8a0749c00000-ffff8a0749ffffff] on node 3 ... The cause is the last region was missed at the and of hot add memory, and p_start, p_end, node_start were not reset, so when hot add memory to a new node, it will consider they are not contiguous blocks and print the previous one. So we print the last vmemmap region at the end of hot add memory to avoid the confusion. Signed-off-by: Zhu Guihua Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9e88f749aa51..26fbba7d888f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -513,6 +513,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, break; err = 0; } + vmemmap_populate_print_last(); return err; } -- cgit v1.2.3 From afa2db2fb6f15f860069de94a1257db57589fe95 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jun 2015 16:58:45 -0700 Subject: tmpfs: truncate prealloc blocks past i_size One of the rocksdb people noticed that when you do something like this fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 10M) pwrite(fd, buf, 5M, 0) ftruncate(5M) on tmpfs, the file would still take up 10M: which led to super fun issues because we were getting ENOSPC before we thought we should be getting ENOSPC. This patch fixes the problem, and mirrors what all the other fs'es do (and was agreed to be the correct behaviour at LSF). I tested it locally to make sure it worked properly with the following xfs_io -f -c "falloc -k 0 10M" -c "pwrite 0 5M" -c "truncate 5M" file Without the patch we have "Blocks: 20480", with the patch we have the correct value of "Blocks: 10240". Signed-off-by: Josef Bacik Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 3759099d8ce4..4caf8ed24d65 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } - if (newsize < oldsize) { + if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); shmem_truncate_range(inode, newsize, (loff_t)-1); -- cgit v1.2.3 From 0867a57c4f80a566dda1bac975b42fcd857cb489 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Jun 2015 16:58:48 -0700 Subject: mm, thp: respect MPOL_PREFERRED policy with non-local node Since commit 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node"), we handle THP allocations on page fault in a special way - for non-interleave memory policies, the allocation is only attempted on the node local to the current CPU, if the policy's nodemask allows the node. This is motivated by the assumption that THP benefits cannot offset the cost of remote accesses, so it's better to fallback to base pages on the local node (which might still be available, while huge pages are not due to fragmentation) than to allocate huge pages on a remote node. The nodemask check prevents us from violating e.g. MPOL_BIND policies where the local node is not among the allowed nodes. However, the current implementation can still give surprising results for the MPOL_PREFERRED policy when the preferred node is different than the current CPU's local node. In such case we should honor the preferred node and not use the local node, which is what this patch does. If hugepage allocation on the preferred node fails, we fall back to base pages and don't try other nodes, with the same motivation as is done for the local node hugepage allocations. The patch also moves the MPOL_INTERLEAVE check around to simplify the hugepage specific test. The difference can be demonstrated using in-tree transhuge-stress test on the following 2-node machine where half memory on one node was occupied to show the difference. > numactl --hardware available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 24 25 26 27 28 29 30 31 32 33 34 35 node 0 size: 7878 MB node 0 free: 3623 MB node 1 cpus: 12 13 14 15 16 17 18 19 20 21 22 23 36 37 38 39 40 41 42 43 44 45 46 47 node 1 size: 8045 MB node 1 free: 7818 MB node distances: node 0 1 0: 10 21 1: 21 10 Before the patch: > numactl -p0 -C0 ./transhuge-stress transhuge-stress: 2.197 s/loop, 0.276 ms/page, 7249.168 MiB/s 7962 succeed, 0 failed, 1786 different pages > numactl -p0 -C12 ./transhuge-stress transhuge-stress: 2.962 s/loop, 0.372 ms/page, 5376.172 MiB/s 7962 succeed, 0 failed, 3873 different pages Number of successful THP allocations corresponds to free memory on node 0 in the first case and node 1 in the second case, i.e. -p parameter is ignored and cpu binding "wins". After the patch: > numactl -p0 -C0 ./transhuge-stress transhuge-stress: 2.183 s/loop, 0.274 ms/page, 7295.516 MiB/s 7962 succeed, 0 failed, 1760 different pages > numactl -p0 -C12 ./transhuge-stress transhuge-stress: 2.878 s/loop, 0.361 ms/page, 5533.638 MiB/s 7962 succeed, 0 failed, 1750 different pages > numactl -p1 -C0 ./transhuge-stress transhuge-stress: 4.628 s/loop, 0.581 ms/page, 3440.893 MiB/s 7962 succeed, 0 failed, 3918 different pages The -p parameter is respected regardless of cpu binding. > numactl -C0 ./transhuge-stress transhuge-stress: 2.202 s/loop, 0.277 ms/page, 7230.003 MiB/s 7962 succeed, 0 failed, 1750 different pages > numactl -C12 ./transhuge-stress transhuge-stress: 3.020 s/loop, 0.379 ms/page, 5273.324 MiB/s 7962 succeed, 0 failed, 3916 different pages Without -p parameter, hugepage restriction to CPU-local node works as before. Fixes: 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node") Signed-off-by: Vlastimil Babka Cc: Aneesh Kumar K.V Acked-by: David Rientjes Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Michal Hocko Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 747743237d9f..99d4c1d0b858 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1972,35 +1972,41 @@ retry_cpuset: pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && - pol->mode != MPOL_INTERLEAVE)) { + if (pol->mode == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); + goto out; + } + + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + /* * For hugepage allocation and non-interleave policy which - * allows the current node, we only try to allocate from the - * current node and don't fall back to other nodes, as the - * cost of remote accesses would likely offset THP benefits. + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. * * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. */ + if (pol->mode == MPOL_PREFERRED && + !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(node, *nmask)) { + if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(node, + page = alloc_pages_exact_node(hpage_node, gfp | __GFP_THISNODE, order); goto out; } } - if (pol->mode == MPOL_INTERLEAVE) { - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - page = alloc_page_interleave(gfp, order, nid); - goto out; - } - nmask = policy_nodemask(gfp, pol); zl = policy_zonelist(gfp, pol, node); mpol_cond_put(pol); -- cgit v1.2.3 From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 24 Jun 2015 16:58:51 -0700 Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the following INFO splat is logged: =============================== [ INFO: suspicious RCU usage. ] 4.1.0-rc7-next-20150612 #1 Not tainted ------------------------------- kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by systemd/1: #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1f/0x40 #1: (rcu_read_lock_bh){......}, at: [] ipv6_add_addr+0x62/0x540 #2: (addrconf_hash_lock){+...+.}, at: [] ipv6_add_addr+0x184/0x540 stack backtrace: CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1 Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014 Call Trace: dump_stack+0x4c/0x6e lockdep_rcu_suspicious+0xe7/0x120 ___might_sleep+0x1d5/0x1f0 __might_sleep+0x4d/0x90 kmem_cache_alloc+0x47/0x250 create_object+0x39/0x2e0 kmemleak_alloc_percpu+0x61/0xe0 pcpu_alloc+0x370/0x630 Additional backtrace lines are truncated. In addition, the above splat is followed by several "BUG: sleeping function called from invalid context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau, these are the clue to the fix. Routine kmemleak_alloc_percpu() always uses GFP_KERNEL for its allocations, whereas it should follow the gfp from its callers. Reviewed-by: Catalin Marinas Reviewed-by: Kamalesh Babulal Acked-by: Martin KaFai Lau Signed-off-by: Larry Finger Cc: Martin KaFai Lau Cc: Catalin Marinas Cc: Tejun Heo Cc: Christoph Lameter Cc: [3.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 9 +++++---- mm/percpu.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index ca9e5a5969a8..cf79f110157c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -930,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); * kmemleak_alloc_percpu - register a newly allocated __percpu object * @ptr: __percpu pointer to beginning of the object * @size: size of the object + * @gfp: flags used for kmemleak internal memory allocations * * This function is called from the kernel percpu allocator when a new object - * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL - * allocation. + * (memory block) is allocated (alloc_percpu). */ -void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) +void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) { unsigned int cpu; @@ -948,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), - size, 0, GFP_KERNEL); + size, 0, gfp); else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } diff --git a/mm/percpu.c b/mm/percpu.c index dfd02484e8de..2dd74487a0af 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1030,7 +1030,7 @@ area_found: memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); - kmemleak_alloc_percpu(ptr, size); + kmemleak_alloc_percpu(ptr, size, gfp); return ptr; fail_unlock: -- cgit v1.2.3