aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c1
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/mmzone.c21
-rw-r--r--mm/page_alloc.c35
-rw-r--r--mm/vmscan.c23
-rw-r--r--mm/vmstat.c68
6 files changed, 127 insertions, 69 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f900de4c67..a9a534a38ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1647,6 +1647,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
if (likely(!ret))
return CHARGE_OK;
+ res_counter_uncharge(&mem->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f..2cfa9bf1f0d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,7 +553,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
- int rcu_locked = 0;
int charge = 0;
struct mem_cgroup *mem = NULL;
struct anon_vma *anon_vma = NULL;
@@ -605,20 +604,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- /* Determine how to safely use anon_vma */
- if (!page_mapped(page)) {
- if (!PageSwapCache(page))
- goto rcu_unlock;
-
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ } else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
@@ -633,13 +638,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
*/
remap_swapcache = 0;
} else {
- /*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
- */
- anon_vma = page_anon_vma(page);
- get_anon_vma(anon_vma);
+ goto uncharge;
}
}
@@ -656,16 +655,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
@@ -679,14 +672,11 @@ skip_unmap:
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
drop_anon_vma(anon_vma);
- if (rcu_locked)
- rcu_read_unlock();
uncharge:
if (!charge)
mem_cgroup_end_migration(mem, page, newpage);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c85..f5b7d176021 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bd6f6da38e..985e072a3dd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1459,24 +1459,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1485,9 +1485,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -2441,7 +2460,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f1..3e71cb1ee28 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (zone->all_unreclaimable)
continue;
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
0, 0))
return 1;
}
@@ -2169,7 +2169,7 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
@@ -2215,7 +2215,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2236,7 +2236,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2244,7 +2244,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
}
@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
*/
if (!sleeping_prematurely(pgdat, order, remaining)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+ restore_pgdat_percpu_threshold(pgdat);
schedule();
+ reduce_pgdat_percpu_threshold(pgdat);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aa..4d7faebb9b7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
+static int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
static int calculate_threshold(struct zone *zone)
{
int threshold;
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_pressure_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
/*
* For use when we know that interrupts are disabled.
*/
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),