diff options
author | Anders Roxell <anders.roxell@linaro.org> | 2014-07-08 14:40:42 +0200 |
---|---|---|
committer | Anders Roxell <anders.roxell@linaro.org> | 2014-07-08 14:54:45 +0200 |
commit | d7789ba79b1d081932a2da6d05f78320a002505a (patch) | |
tree | 1462027856037610434f3911017a1d22113bf771 /mm | |
parent | 83f12c5cb20669c30ac711f28f09d7f464124d00 (diff) | |
parent | bbae7add628cfe96a1facd578dd1eddcd1030de7 (diff) |
Merge tag 'v3.14.10' into v3.14-rt
This is the 3.14.10 stable release
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Conflicts:
include/linux/irqdesc.h
include/linux/thread_info.h
kernel/irq/manage.c
kernel/locking/rtmutex.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/compaction.c | 22 | ||||
-rw-r--r-- | mm/huge_memory.c | 13 | ||||
-rw-r--r-- | mm/hugetlb.c | 1 | ||||
-rw-r--r-- | mm/memory-failure.c | 90 | ||||
-rw-r--r-- | mm/memory.c | 5 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/mremap.c | 9 | ||||
-rw-r--r-- | mm/page-writeback.c | 17 | ||||
-rw-r--r-- | mm/page_alloc.c | 52 | ||||
-rw-r--r-- | mm/percpu.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 11 | ||||
-rw-r--r-- | mm/vmscan.c | 46 |
13 files changed, 195 insertions, 82 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 64007ac64526..fc9125819598 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -263,6 +263,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT diff --git a/mm/compaction.c b/mm/compaction.c index 918577595ea8..5f702ef0a65f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -666,16 +666,20 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; + unsigned long high_pfn, low_pfn, pfn, z_end_pfn; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do pfn -= pageblock_nr_pages + * in the for loop. + * The low boundary is the end of the pageblock the migration scanner + * is using. */ - pfn = cc->free_pfn; + pfn = cc->free_pfn & ~(pageblock_nr_pages-1); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* @@ -695,6 +699,7 @@ static void isolate_freepages(struct zone *zone, for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; + unsigned long end_pfn; /* * This can iterate a massively long zone without finding any @@ -729,13 +734,10 @@ static void isolate_freepages(struct zone *zone, isolated = 0; /* - * As pfn may not start aligned, pfn+pageblock_nr_page - * may cross a MAX_ORDER_NR_PAGES boundary and miss - * a pfn_valid check. Ensure isolate_freepages_block() - * only scans within a pageblock + * Take care when isolating in last pageblock of a zone which + * ends in the middle of a pageblock. */ - end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, z_end_pfn); + end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); isolated = isolate_freepages_block(cc, pfn, end_pfn, freelist, false); nr_freepages += isolated; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d78..1c42d0c36d0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1611,16 +1611,23 @@ pmd_t *page_check_address_pmd(struct page *page, enum page_check_address_pmd_flag flag, spinlock_t **ptl) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) return NULL; - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + *ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) goto unlock; if (pmd_page(*pmd) != page) goto unlock; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2de3c845f03a..06a9bc0a3120 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1134,6 +1134,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 90002ea43638..33365e9ce6a7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -208,9 +208,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #endif si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal @@ -384,20 +384,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) +{ + struct task_struct *t; if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -412,16 +443,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -432,7 +464,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -442,10 +474,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -455,8 +487,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -469,7 +501,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -480,9 +513,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -967,7 +1000,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1085,15 +1118,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } else if (PageHuge(hpage)) { /* - * Check "just unpoisoned", "filter hit", and - * "race with other subpage." + * Check "filter hit" and "race with other subpage." */ lock_page(hpage); - if (!PageHWPoison(hpage) - || (hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &num_poisoned_pages); - return 0; + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); @@ -1156,6 +1190,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); res = 0; goto out; } diff --git a/mm/memory.c b/mm/memory.c index eef1e5cfe8de..8c595234528a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1929,12 +1929,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) { struct vm_area_struct *vma; + vm_flags_t vm_flags; int ret; vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d4..56224d998c39 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -526,9 +526,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, int nid; struct page *page; spinlock_t *ptl; + pte_t entry; ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - page = pte_page(huge_ptep_get((pte_t *)pmd)); + entry = huge_ptep_get((pte_t *)pmd); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) goto unlock; diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3d..05f1180e9f21 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma, break; if (pmd_trans_huge(*old_pmd)) { int err = 0; - if (extent == HPAGE_PMD_SIZE) + if (extent == HPAGE_PMD_SIZE) { + VM_BUG_ON(vma->vm_file || !vma->anon_vma); + /* See comment in move_ptes() */ + if (need_rmap_locks) + anon_vma_lock_write(vma->anon_vma); err = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); + if (need_rmap_locks) + anon_vma_unlock_write(vma->anon_vma); + } if (err > 0) { need_flush = true; continue; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8e..d013dba21429 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -593,14 +593,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ -static inline long long pos_ratio_polynom(unsigned long setpoint, +static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -842,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -2398,7 +2398,7 @@ int test_clear_page_writeback(struct page *page) return ret; } -int test_set_page_writeback(struct page *page) +int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); int ret; @@ -2423,9 +2423,10 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_TOWRITE); + if (!keep_write) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); @@ -2436,7 +2437,7 @@ int test_set_page_writeback(struct page *page) return ret; } -EXPORT_SYMBOL(test_set_page_writeback); +EXPORT_SYMBOL(__test_set_page_writeback); /* * Return true if any of the pages in the mapping are marked with the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 563d41c4935b..672afc4c4157 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6064,53 +6064,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pageblock_flags_mask(struct page *page, unsigned long flags, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* diff --git a/mm/percpu.c b/mm/percpu.c index 036cfe07050f..a2a54a85f691 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -612,7 +612,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); return NULL; } diff --git a/mm/rmap.c b/mm/rmap.c index d3cbac508c2f..cdbd31285cf6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ + might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); @@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) * above cannot corrupt). */ if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; + return NULL; } out: rcu_read_unlock(); @@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; - goto out; + return NULL; } /* we pinned the anon_vma, its safe to sleep */ @@ -1554,10 +1556,9 @@ void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; + anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); - - anon_vma_free(anon_vma); } static struct anon_vma *rmap_walk_anon_lock(struct page *page, diff --git a/mm/vmscan.c b/mm/vmscan.c index a9c74b409681..6ef876cae8f1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2502,10 +2502,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2530,9 +2537,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2551,10 +2558,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ @@ -3285,7 +3316,10 @@ static int kswapd(void *p) } } + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } |