From dce0b4fcf24d431eea39d7675c72a8b830ae6a65 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Thu, 6 Feb 2014 12:04:24 -0800
Subject: mm: __set_page_dirty_nobuffers() uses spin_lock_irqsave() instead of
 spin_lock_irq()

commit a85d9df1ea1d23682a0ed1e100e6965006595d06 upstream.

During an aio stress test, we observed the following lockdep warning.
This means AIO+numa_balancing is currently deadlockable.

The problem is that aio_migratepage disables interrupts, but
__set_page_dirty_nobuffers unintentionally enables them again.

Generally, all helper functions should use spin_lock_irqsave() instead of
spin_lock_irq() because they don't know their callers at all.

    other info that might help us debug this:
     Possible unsafe locking scenario:

           CPU0
           ----
      lock(&(&ctx->completion_lock)->rlock);
      lock(&(&ctx->completion_lock)->rlock);

     *** DEADLOCK ***

      dump_stack+0x19/0x1b
      print_usage_bug+0x1f7/0x208
      mark_lock+0x21d/0x2a0
      mark_held_locks+0xb9/0x140
      trace_hardirqs_on_caller+0x105/0x1d0
      trace_hardirqs_on+0xd/0x10
      _raw_spin_unlock_irq+0x2c/0x50
      __set_page_dirty_nobuffers+0x8c/0xf0
      migrate_page_copy+0x434/0x540
      aio_migratepage+0xb1/0x140
      move_to_new_page+0x7d/0x230
      migrate_pages+0x5e5/0x700
      migrate_misplaced_page+0xbc/0xf0
      do_numa_page+0x102/0x190
      handle_pte_fault+0x241/0x970
      handle_mm_fault+0x265/0x370
      __do_page_fault+0x172/0x5a0
      do_page_fault+0x1a/0x70
      page_fault+0x28/0x30

Signed-off-by: KOSAKI Motohiro
Cc: Larry Woodman
Cc: Rik van Riel
Cc: Johannes Weiner
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/page-writeback.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5a06d4cb9a3d..73cbc5dc150b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2026,11 +2026,12 @@ int __set_page_dirty_nobuffers(struct page *page)
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
+		unsigned long flags;
 
 		if (!mapping)
 			return 1;
 
-		spin_lock_irq(&mapping->tree_lock);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		mapping2 = page_mapping(page);
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
@@ -2039,7 +2040,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
--
cgit v1.2.3


From b0d4c0f8122abd6069312ccc20089d5c1c19c772 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 3 Jul 2013 15:02:37 -0700
Subject: mm/memory-failure.c: fix memory leak in successful soft offlining

commit f15bdfa802bfa5eb6b4b5a241b97ec9fa1204a35 upstream.

After a successful page migration by soft offlining, the source page is
not properly freed and it's never reusable even if we unpoison it
afterward.

This is caused by the race between freeing the page and setting
PG_hwpoison.  In successful soft offlining, the source page is put (and
the refcount becomes 0) by putback_lru_page() in unmap_and_move(), where
it's linked to a pagevec and actual freeing back to buddy is delayed.  So
if PG_hwpoison is set for the page before freeing, the freeing does not
function as expected (in such a case freeing aborts in the
free_pages_prepare() check.)

This patch tries to make sure to free the source page before setting
PG_hwpoison on it.
To avoid reallocating, the page keeps MIGRATE_ISOLATE until after setting
PG_hwpoison.

This patch also removes obsolete comments about "keeping elevated
refcount" because what they say is not true.  Unlike memory_failure(),
soft_offline_page() uses no special page isolation code, and the
soft-offlined pages have no elevated refcount.

Signed-off-by: Naoya Horiguchi
Cc: Andi Kleen
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Cc: Xishi Qiu
Signed-off-by: Greg Kroah-Hartman
---
 mm/memory-failure.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3b4120e38d48..f2a591d87d00 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1421,7 +1421,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 
 	/*
 	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free.
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
 	 */
 	set_migratetype_isolate(p, true);
 	/*
@@ -1444,7 +1445,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		/* Not a free page */
 		ret = 1;
 	}
-	unset_migratetype_isolate(p, MIGRATE_MOVABLE);
 	unlock_memory_hotplug();
 	return ret;
 }
@@ -1511,7 +1511,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	}
-	/* keep elevated page count for bad page */
 	return ret;
 }
 
@@ -1576,7 +1575,7 @@ int soft_offline_page(struct page *page, int flags)
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	}
-	/* keep elevated page count for bad page */
+	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
 
@@ -1642,7 +1641,22 @@ static int __soft_offline_page(struct page *page, int flags)
 			if (ret > 0)
 				ret = -EIO;
 		} else {
+			/*
+			 * After page migration succeeds, the source page can
+			 * be trapped in pagevec and actual freeing is delayed.
+			 * Freeing code works differently based on PG_hwpoison,
+			 * so there's a race. We need to make sure that the
+			 * source page should be freed back to buddy before
+			 * setting PG_hwpoison.
+			 */
+			if (!is_free_buddy_page(page))
+				lru_add_drain_all();
+			if (!is_free_buddy_page(page))
+				drain_all_pages();
 			SetPageHWPoison(page);
+			if (!is_free_buddy_page(page))
+				pr_info("soft offline: %#lx: page leaked\n",
+					pfn);
 			atomic_long_inc(&num_poisoned_pages);
 		}
 	} else {
--
cgit v1.2.3


From 6843d9254c8cd2decb30edaf7deffaad4d244c51 Mon Sep 17 00:00:00 2001
From: Xishi Qiu
Date: Fri, 14 Feb 2014 10:33:35 +0800
Subject: mm: fix process accidentally killed by mce because of huge page
 migration

Based on c8721bbbdd36382de51cd6b7a56322e0acca2414 upstream, but only the
bugfix portion pulled out.

Hi Naoya or Greg,

We found a bug in 3.10.x.
The problem is that we accidentally have a hwpoisoned hugepage in the
free hugepage list.
It could happen in the following scenario:

process A				process B

migrate_huge_page
put_page (old hugepage)
  linked to free hugepage list
					hugetlb_fault
					  hugetlb_no_page
					    alloc_huge_page
					      dequeue_huge_page_vma
					        dequeue_huge_page_node
					          (steal hwpoisoned hugepage)
set_page_hwpoison_huge_page
dequeue_hwpoisoned_huge_page
  (fail to dequeue)

I tested this bug: one process keeps allocating huge pages, and I use the
sysfs interface to soft offline a huge page, then received:
"MCE: Killing UCP:2717 due to hardware memory corruption fault at 8200034"

Upstream kernel is free from this bug because of these two commits:

f15bdfa802bfa5eb6b4b5a241b97ec9fa1204a35
mm/memory-failure.c: fix memory leak in successful soft offlining

c8721bbbdd36382de51cd6b7a56322e0acca2414
mm: memory-hotplug: enable memory hotplug to handle hugepage

The first one, although it addresses a memory leak, also moves
unset_migratetype_isolate(), which is important to avoid the race.
The latter is not a bug fix and it's too big, so I rewrote a small one.

The following patch can fix this bug. (Please apply f15bdfa802bf first.)

Signed-off-by: Xishi Qiu
Reviewed-by: Naoya Horiguchi
Signed-off-by: Greg Kroah-Hartman
---
 mm/hugetlb.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 40ad2c6e0ca9..aa3b9a63394b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/page-isolation.h>
 
 #include
 #include
@@ -517,9 +518,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
-	if (list_empty(&h->hugepage_freelists[nid]))
+	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+		if (!is_migrate_isolate_page(page))
+			break;
+	/*
+	 * if 'non-isolated free hugepage' not found on the list,
+	 * the allocation fails.
+	 */
+	if (&h->hugepage_freelists[nid] == &page->lru)
 		return NULL;
-	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
 	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
--
cgit v1.2.3


From 2186bb4e794d284012e9af6bf592e5375279439d Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Mon, 10 Feb 2014 14:25:50 -0800
Subject: mm/memory-failure.c: move refcount only in !MF_COUNT_INCREASED

commit 8d547ff4ac5927245e0833ac18528f939da0ee0e upstream.

mce-test detected a test failure when injecting an error into a thp tail
page.  This is because we take the page refcount of the tail page in
madvise_hwpoison() while the fix in commit a3e0f9e47d5e
("mm/memory-failure.c: transfer page count from head page to tail page
after split thp") assumes that we always take refcount on the head page.

When a real memory error happens we take refcount on the head page where
memory_failure() is called without MF_COUNT_INCREASED set, so it seems to
me that testing memory error on a thp tail page using madvise makes
little sense.

This patch cancels moving refcount in !MF_COUNT_INCREASED for valid
testing.

[akpm@linux-foundation.org: s/&&/&/]
Signed-off-by: Naoya Horiguchi
Cc: Andi Kleen
Cc: Wanpeng Li
Cc: Chen Gong
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 mm/memory-failure.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f2a591d87d00..e386beefc994 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -943,8 +943,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * to it.  Similarly, page lock is shifted.
 	 */
 	if (hpage != p) {
-		put_page(hpage);
-		get_page(p);
+		if (!(flags & MF_COUNT_INCREASED)) {
+			put_page(hpage);
+			get_page(p);
+		}
 		lock_page(p);
 		unlock_page(hpage);
 		*hpagep = p;
--
cgit v1.2.3
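
The first patch in this series relies on a general locking rule: a helper
that cannot know whether its caller already disabled interrupts should pair
spin_lock_irqsave() with spin_unlock_irqrestore(), because spin_unlock_irq()
re-enables interrupts unconditionally. Below is a minimal kernel-style
sketch of that rule, not part of any patch above; the counter structure and
both helper names are hypothetical and chosen only for illustration.

/* Illustrative sketch only; the names below are hypothetical. */
#include <linux/spinlock.h>

struct counter {
	spinlock_t lock;
	unsigned long value;
};

/*
 * Risky in a generic helper: spin_unlock_irq() re-enables interrupts
 * unconditionally, so a caller that took an irq-disabled lock of its
 * own (as aio_migratepage does) has its critical section broken.
 */
static void counter_inc_irq(struct counter *c)
{
	spin_lock_irq(&c->lock);
	c->value++;
	spin_unlock_irq(&c->lock);	/* interrupts forced back on here */
}

/*
 * Safe for any caller: spin_lock_irqsave() records the caller's
 * interrupt state in 'flags' and spin_unlock_irqrestore() restores it
 * exactly as it was.
 */
static void counter_inc(struct counter *c)
{
	unsigned long flags;

	spin_lock_irqsave(&c->lock, flags);
	c->value++;
	spin_unlock_irqrestore(&c->lock, flags);
}

This is the same transformation the first patch applies to
__set_page_dirty_nobuffers().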