From aa7994f281a5e705b5f9cb13b3219fc346263872 Mon Sep 17 00:00:00 2001 From: Li Haifeng Date: Mon, 17 Sep 2012 14:09:21 -0700 Subject: mm/page_alloc: fix the page address of higher page's buddy calculation commit 0ba8f2d59304dfe69b59c034de723ad80f7ab9ac upstream. The heuristic method for buddy has been introduced since commit 43506fad21ca ("mm/page_alloc.c: simplify calculation of combined index of adjacent buddy lists"). But the page address of higher page's buddy was wrongly calculated, which will lead page_is_buddy to fail for ever. IOW, the heuristic method would be disabled with the wrong page address of higher page's buddy. Calculating the page address of higher page's buddy should be based higher_page with the offset between index of higher page and index of higher page's buddy. Signed-off-by: Haifeng Li Signed-off-by: Gavin Shan Reviewed-by: Michal Hocko Cc: KyongHo Cho Cc: Mel Gorman Cc: Minchan Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..88a6d87041df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -579,7 +579,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); -- cgit v1.2.3 From 967c0e4ab954d228ca87d85c523ae8a65c03cc5e Mon Sep 17 00:00:00 2001 From: qiuxishi Date: Mon, 17 Sep 2012 14:09:24 -0700 Subject: memory hotplug: fix section info double registration bug commit f14851af0ebb32745c6c5a2e400aa0549f9d20df upstream. There may be a bug when registering section info. For example, on my Itanium platform, the pfn range of node0 includes the other nodes, so other nodes' section info will be double registered, and memmap's page count will equal to 3. node0: start_pfn=0x100, spanned_pfn=0x20fb00, present_pfn=0x7f8a3, => 0x000100-0x20fc00 node1: start_pfn=0x80000, spanned_pfn=0x80000, present_pfn=0x80000, => 0x080000-0x100000 node2: start_pfn=0x100000, spanned_pfn=0x80000, present_pfn=0x80000, => 0x100000-0x180000 node3: start_pfn=0x180000, spanned_pfn=0x80000, present_pfn=0x80000, => 0x180000-0x200000 free_all_bootmem_node() register_page_bootmem_info_node() register_page_bootmem_info_section() When hot remove memory, we can't free the memmap's page because page_count() is 2 after put_page_bootmem(). sparse_remove_one_section() free_section_usemap() free_map_bootmem() put_page_bootmem() [akpm@linux-foundation.org: add code comment] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Acked-by: Mel Gorman Cc: "Luck, Tony" Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory_hotplug.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..9ad7d1ef6ac1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -127,9 +127,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -188,9 +185,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) end_pfn = pfn + pgdat->node_spanned_pages; /* register_section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) - register_page_bootmem_info_section(pfn); - + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other node. + */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -- cgit v1.2.3 From 391c314928c37b0c946061c0adddd5bd5730c8b7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:53 -0700 Subject: mm: sparse: fix usemap allocation above node descriptor section commit 99ab7b19440a72ebdf225f99b20f8ef40decee86 upstream. After commit f5bf18fa22f8 ("bootmem/sparsemem: remove limit constraint in alloc_bootmem_section"), usemap allocations may easily be placed outside the optimal section that holds the node descriptor, even if there is space available in that section. This results in unnecessary hotplug dependencies that need to have the node unplugged before the section holding the usemap. The reason is that the bootmem allocator doesn't guarantee a linear search starting from the passed allocation goal but may start out at a much higher address absent an upper limit. Fix this by trying the allocation with the limit at the section end, then retry without if that fails. This keeps the fix from f5bf18fa22f8 of not panicking if the allocation does not fit in the section, but still makes sure to try to stay within the section at first. [rewritten massively by Johannes to apply to 3.4] Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d54..53cf62b186b6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -766,13 +766,17 @@ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { bootmem_data_t *bdata; - unsigned long pfn, goal; + unsigned long pfn, goal, limit; pfn = section_nr_to_pfn(section_nr); goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); + if (goal + size > limit) + limit = 0; + + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); } #endif -- cgit v1.2.3 From 49194d4e7d8c14fb83f213c54a476685f8389c70 Mon Sep 17 00:00:00 2001 From: Satoru Moriya Date: Tue, 29 May 2012 15:06:47 -0700 Subject: mm: avoid swapping out with swappiness==0 commit fe35004fbf9eaf67482b074a2e032abb9c89b1dd upstream. Sometimes we'd like to avoid swapping out anonymous memory. In particular, avoid swapping out pages of important process or process groups while there is a reasonable amount of pagecache on RAM so that we can satisfy our customers' requirements. OTOH, we can control how aggressive the kernel will swap memory pages with /proc/sys/vm/swappiness for global and /sys/fs/cgroup/memory/memory.swappiness for each memcg. But with current reclaim implementation, the kernel may swap out even if we set swappiness=0 and there is pagecache in RAM. This patch changes the behavior with swappiness==0. If we set swappiness==0, the kernel does not swap out completely (for global reclaim until the amount of free pages and filebacked pages in a zone has been reduced to something very very small (nr_free + nr_filebacked < high watermark)). Signed-off-by: Satoru Moriya Acked-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index be5bc0af2e76..e989ee22f100 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1983,10 +1983,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&mz->zone->lru_lock); @@ -1999,7 +1999,7 @@ out: unsigned long scan; scan = zone_nr_lru_pages(mz, lru); - if (priority || noswap) { + if (priority || noswap || !vmscan_swappiness(mz, sc)) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; -- cgit v1.2.3