From 40e0484473ac43b557796c9f392c2cb69b1f55ae Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:50 -0700 Subject: mm/memblock: cleanup on duplicate VA/PA conversion commit 4e2f07750d9a94e8f23e86408df5ab95be88bf11 upstream. The overall memblock has been organized into the memory regions and reserved regions. Initially, the memory regions and reserved regions are stored in the predetermined arrays of "struct memblock _region". It's possible for the arrays to be enlarged when we have newly added regions for them, but no enough space there. Under the situation, We will created double-sized array to meet the requirement. However, the original implementation converted the VA (Virtual Address) of the newly allocated array of regions to PA (Physical Address), then translate back when we allocates the new array from slab. That's actually unnecessary. The patch removes the duplicate VA/PA conversion. Signed-off-by: Gavin Shan Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memblock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index a44eab3157f8..eae06ea3aa50 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -212,14 +212,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; - } else + } else { addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + new_array = addr ? __va(addr) : 0; + } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; } - new_array = __va(addr); memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); -- cgit v1.2.3 From 659586507ac1f4219d1b2f2d471381e39f7e6961 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:50 -0700 Subject: mm/memblock: fix memory leak on extending regions commit 181eb39425f2b9275afcb015eaa547d11f71a02f upstream. The overall memblock has been organized into the memory regions and reserved regions. Initially, the memory regions and reserved regions are stored in the predetermined arrays of "struct memblock _region". It's possible for the arrays to be enlarged when we have newly added regions, but no free space left there. The policy here is to create double-sized array either by slab allocator or memblock allocator. Unfortunately, we didn't free the old array, which might be allocated through slab allocator before. That would cause memory leak. The patch introduces 2 variables to trace where (slab or memblock) the memory and reserved regions come from. The memory for the memory or reserved regions will be deallocated by kfree() if that was allocated by slab allocator. Thus to fix the memory leak issue. Signed-off-by: Gavin Shan Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memblock.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index eae06ea3aa50..952123eba433 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { int memblock_debug __initdata_memblock; static int memblock_can_resize __initdata_memblock; +static int memblock_memory_in_slab __initdata_memblock = 0; +static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) struct memblock_region *new_array, *old_array; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); + int *in_slab; /* We don't allow resizing until we know about the reserved regions * of memory that aren't suitable for allocation @@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; + /* Try to find some space for it. * * WARNING: We assume that either slab_is_available() and we use it or @@ -235,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) type->regions = new_array; type->max <<= 1; - /* If we use SLAB that's it, we are done */ - if (use_slab) - return 0; - - /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_reserve(addr, new_size)); - - /* If the array wasn't our static init one, then free it. We only do - * that before SLAB is available as later on, we don't know whether - * to use kfree or free_bootmem_pages(). Shouldn't be a big deal - * anyways + /* Free old array. We needn't free it if the array is the + * static one */ - if (old_array != memblock_memory_init_regions && - old_array != memblock_reserved_init_regions) + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) memblock_free(__pa(old_array), old_size); + /* Reserve the new array if that comes from the memblock. + * Otherwise, we needn't do it + */ + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_size)); + + /* Update slab flag */ + *in_slab = use_slab; + return 0; } -- cgit v1.2.3 From 757fcf2b6df4814604764a4abb54d2b58f422fb5 Mon Sep 17 00:00:00 2001 From: Greg Pearson Date: Wed, 20 Jun 2012 12:53:05 -0700 Subject: mm/memblock: fix overlapping allocation when doubling reserved array commit 48c3b583bbddad2220ca4c22319ca5d1f78b2090 upstream. __alloc_memory_core_early() asks memblock for a range of memory then try to reserve it. If the reserved region array lacks space for the new range, memblock_double_array() is called to allocate more space for the array. If memblock is used to allocate memory for the new array it can end up using a range that overlaps with the range originally allocated in __alloc_memory_core_early(), leading to possible data corruption. With this patch memblock_double_array() now calls memblock_find_in_range() with a narrowed candidate range (in cases where the reserved.regions array is being doubled) so any memory allocated will not overlap with the original range that was being reserved. The range is narrowed by passing in the starting address and size of the previously allocated range. Then the range above the ending address is searched and if a candidate is not found, the range below the starting address is searched. Signed-off-by: Greg Pearson Signed-off-by: Yinghai Lu Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memblock.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 952123eba433..0e737d9ab2ba 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -184,7 +184,24 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; phys_addr_t old_size, new_size, addr; @@ -222,7 +239,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; } else { - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_size, sizeof(phys_addr_t)); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_size, sizeof(phys_addr_t)); + new_array = addr ? __va(addr) : 0; } if (!addr) { @@ -399,7 +427,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -450,7 +478,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { -- cgit v1.2.3 From 2c07f25ea7800adb36cd8da9b58c4ecd3fc3d064 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses commit 047fe3605235888f3ebcda0c728cb31937eadfe6 upstream. Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Tested-by: Dave Jones Signed-off-by: Jens Axboe [bwh: Backported to 3.2: - Adjust context in vmsplice_to_pipe() - Update one more call to splice_shrink_spd(), from skb_splice_bits()] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f99ff3e50bd6..9d65a02a8799 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1365,6 +1365,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1453,7 +1454,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; -- cgit v1.2.3 From 0c4ad5cc8c01f62fe5211b5ce9563c27f795a4ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Jul 2012 16:00:11 -0700 Subject: mm: Hold a file reference in madvise_remove commit 9ab4233dd08036fe34a89c7dc6f47a8bf2eb29eb upstream. Otherwise the code races with munmap (causing a use-after-free of the vma) or with close (causing a use-after-free of the struct file). The bug was introduced by commit 90ed52ebe481 ("[PATCH] holepunch: fix mmap_sem i_mutex deadlock") Cc: Hugh Dickins Cc: Miklos Szeredi Cc: Badari Pulavarty Cc: Nick Piggin Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds [bwh: Backported to 3.2: - Adjust context - madvise_remove() calls vmtruncate_range(), not do_fallocate()] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- mm/madvise.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b6674..55f645c85a35 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -203,14 +204,16 @@ static long madvise_remove(struct vm_area_struct *vma, struct address_space *mapping; loff_t offset, endoff; int error; + struct file *f; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; - if (!vma->vm_file || !vma->vm_file->f_mapping - || !vma->vm_file->f_mapping->host) { + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } @@ -224,9 +227,16 @@ static long madvise_remove(struct vm_area_struct *vma, endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex */ + /* + * vmtruncate_range may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); up_read(¤t->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); + fput(f); down_read(¤t->mm->mmap_sem); return error; } -- cgit v1.2.3 From 0e343dbe08acb440f7914d989bcc32c1d1576735 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 11 Jul 2012 14:01:52 -0700 Subject: memory hotplug: fix invalid memory access caused by stale kswapd pointer commit d8adde17e5f858427504725218c56aef90e90fc7 upstream. kswapd_stop() is called to destroy the kswapd work thread when all memory of a NUMA node has been offlined. But kswapd_stop() only terminates the work thread without resetting NODE_DATA(nid)->kswapd to NULL. The stale pointer will prevent kswapd_run() from creating a new work thread when adding memory to the memory-less NUMA node again. Eventually the stale pointer may cause invalid memory access. An example stack dump as below. It's reproduced with 2.6.32, but latest kernel has the same issue. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] exit_creds+0x12/0x78 PGD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/system/memory/memory391/state CPU 11 Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285 RIP: 0010:exit_creds+0x12/0x78 RSP: 0018:ffff8806044f1d78 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10 R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000 R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001 FS: 00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600) Stack: ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1 0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003 Call Trace: __put_task_struct+0x5d/0x97 kthread_stop+0x50/0x58 offline_pages+0x324/0x3da memory_block_change_state+0x179/0x1db store_mem_state+0x9e/0xbb sysfs_write_file+0xd0/0x107 vfs_write+0xad/0x169 sys_write+0x45/0x6e system_call_fastpath+0x16/0x1b Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0 RIP exit_creds+0x12/0x78 RSP CR2: 0000000000000000 [akpm@linux-foundation.org: add pglist_data.kswapd locking comments] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0932dc27e19d..4607cc62b1d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3279,14 +3279,17 @@ int kswapd_run(int nid) } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold lock_memory_hotplug(). */ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) -- cgit v1.2.3 From 7a08b440fa93e036968102597c8a2ab809a9bdc4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 11 Jul 2012 14:02:13 -0700 Subject: mm, thp: abort compaction if migration page cannot be charged to memcg commit 4bf2bba3750f10aa9e62e6949bc7e8329990f01b upstream. If page migration cannot charge the temporary page to the memcg, migrate_pages() will return -ENOMEM. This isn't considered in memory compaction however, and the loop continues to iterate over all pageblocks trying to isolate and migrate pages. If a small number of very large memcgs happen to be oom, however, these attempts will mostly be futile leading to an enormous amout of cpu consumption due to the page migration failures. This patch will short circuit and fail memory compaction if migrate_pages() returns -ENOMEM. COMPACT_PARTIAL is returned in case some migrations were successful so that the page allocator will retry. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 74a8c825ff28..459b0ab60524 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -594,8 +594,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; + if (err == -ENOMEM) { + ret = COMPACT_PARTIAL; + goto out; + } } - } out: -- cgit v1.2.3 From 7ad71f960f0f6e06cbded278809674afc515036a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:56 -0700 Subject: memblock: free allocated memblock_reserved_regions later commit 29f6738609e40227dabcc63bfb3b84b3726a75bd upstream. memblock_free_reserved_regions() calls memblock_free(), but memblock_free() would double reserved.regions too, so we could free the old range for reserved.regions. Also tj said there is another bug which could be related to this. | I don't think we're saving any noticeable | amount by doing this "free - give it to page allocator - reserve | again" dancing. We should just allocate regions aligned to page | boundaries and free them later when memblock is no longer in use. in that case, when DEBUG_PAGEALLOC, will get panic: memblock_free: [0x0000102febc080-0x0000102febf080] memblock_free_reserved_regions+0x37/0x39 BUG: unable to handle kernel paging request at ffff88102febd948 IP: [] __next_free_mem_range+0x9b/0x155 PGD 4826063 PUD cf67a067 PMD cf7fa067 PTE 800000102febd160 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU 0 Pid: 0, comm: swapper Not tainted 3.5.0-rc2-next-20120614-sasha #447 RIP: 0010:[] [] __next_free_mem_range+0x9b/0x155 See the discussion at https://lkml.org/lkml/2012/6/13/469 So try to allocate with PAGE_SIZE alignment and free it later. Reported-by: Sasha Levin Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memblock.c | 51 +++++++++++++++++++++++---------------------------- mm/nobootmem.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 0e737d9ab2ba..280d3d7835d6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, MAX_NUMNODES); } -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= type->regions[r].size; @@ -184,6 +160,18 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + /** * memblock_double_array - double the size of the memblock regions array * @type: memblock type of the regions array being doubled @@ -204,6 +192,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); int *in_slab; @@ -217,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocated new one align to PAGE_SIZE, + * so we can free them completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); /* Retrieve the slab flag */ if (type == &memblock.memory) @@ -245,11 +240,11 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, addr = memblock_find_in_range(new_area_start + new_area_size, memblock.current_limit, - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); if (!addr && new_area_size) addr = memblock_find_in_range(0, min(new_area_start, memblock.current_limit), - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); new_array = addr ? __va(addr) : 0; } @@ -279,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + memblock_free(__pa(old_array), old_alloc_size); /* Reserve the new array if that comes from the memblock. * Otherwise, we needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_size)); + BUG_ON(memblock_reserve(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c7026..218e6f95d44f 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + unsigned long __init free_low_memory_core_early(int nodeid) { unsigned long count = 0; - phys_addr_t start, end; + phys_addr_t start, end, size; u64 i; - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } - } + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + count += __free_memory_core(start, end); + + /* free range that is used for reserved array if we allocate it */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } -- cgit v1.2.3 From 07aa70120e4be526a468b9c3ea93b45cfbe95013 Mon Sep 17 00:00:00 2001 From: Aaditya Kumar Date: Tue, 17 Jul 2012 15:48:07 -0700 Subject: mm: fix lost kswapd wakeup in kswapd_stop() commit 1c7e7f6c0703d03af6bcd5ccc11fc15d23e5ecbe upstream. Offlining memory may block forever, waiting for kswapd() to wake up because kswapd() does not check the event kthread->should_stop before sleeping. The proper pattern, from Documentation/memory-barriers.txt, is: --- waker --- event_indicated = 1; wake_up_process(event_daemon); --- sleeper --- for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (event_indicated) break; schedule(); } set_current_state() may be wrapped by: prepare_to_wait(); In the kswapd() case, event_indicated is kthread->should_stop. === offlining memory (waker) === kswapd_stop() kthread_stop() kthread->should_stop = 1 wake_up_process() wait_for_completion() === kswapd_try_to_sleep (sleeper) === kswapd_try_to_sleep() prepare_to_wait() . . schedule() . . finish_wait() The schedule() needs to be protected by a test of kthread->should_stop, which is wrapped by kthread_should_stop(). Reproducer: Do heavy file I/O in background. Do a memory offline/online in a tight loop Signed-off-by: Aaditya Kumar Acked-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4607cc62b1d9..be5bc0af2e76 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3013,7 +3013,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - schedule(); + + if (!kthread_should_stop()) + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) -- cgit v1.2.3