From 79bd9814e5ec9a288d6599f53aeac0b548fdfe52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: cgroup, memcg: move cgroup_event implementation to memcg cgroup_event is way over-designed and tries to build a generic flexible event mechanism into cgroup - fully customizable event specification for each user of the interface. This is utterly unnecessary and overboard especially in the light of the planned unified hierarchy as there's gonna be single agent. Simply generating events at fixed points, or if that's too restrictive, configureable cadence or single set of configureable points should be enough. Thankfully, memcg is the only user and gets to keep it. Replacing it with something simpler on sane_behavior is strongly recommended. This patch moves cgroup_event and "cgroup.event_control" implementation to mm/memcontrol.c. Clearing of events on cgroup destruction is moved from cgroup_destroy_locked() to mem_cgroup_css_offline(), which shouldn't make any noticeable difference. cgroup_css() and __file_cft() are exported to enable the move; however, this will soon be reverted once the event code is updated to be memcg specific. Note that "cgroup.event_control" will now exist only on the hierarchy with memcg attached to it. While this change is visible to userland, it is unlikely to be noticeable as the file has never been meaningful outside memcg. Aside from the above change, this is pure code relocation. v2: Per Li Zefan's comments, init/Kconfig updated accordingly and poll.h inclusion moved from cgroup.c to memcontrol.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13b9d0f221b8..02dae3292668 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -226,6 +228,36 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct cgroup_event { + /* + * css which the event belongs to. + */ + struct cgroup_subsys_state *css; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -5947,6 +5979,202 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup_subsys_state *css = event->css; + + remove_wait_queue(event->wqh, &event->wait); + + event->cft->unregister_event(css, event->cft, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->css->cgroup; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, const char *buffer) +{ + struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + event->cft = __file_cft(cfile.file); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto out_put_cfile; + } + + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto out_put_css; + } + + ret = event->cft->register_event(event->css, event->cft, + event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + +out_put_css: + css_put(event->css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -5993,6 +6221,12 @@ static struct cftype mem_cgroup_files[] = { .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "cgroup.event_control", + .write_string = cgroup_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@ -6326,6 +6560,20 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup *cgrp = css->cgroup; + struct cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); kmem_cgroup_css_offline(memcg); -- cgit v1.2.3 From b5557c4c3b1a38074d7001b87c2482eda3a0834a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:42 -0500 Subject: memcg: cgroup_write_event_control() now knows @css is for memcg @css for cgroup_write_event_control() is now always for memcg and the target file should be a memcg file too. Drop code which assumes @css is dummy_css and the target file may belong to different subsystems. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- mm/memcontrol.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 02dae3292668..d00368110b08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6056,10 +6056,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, +static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; @@ -6082,6 +6082,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, if (!event) return -ENOMEM; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6117,23 +6118,17 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. */ rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) ret = 0; rcu_read_unlock(); @@ -6145,7 +6140,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_css; } - ret = event->cft->register_event(event->css, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; @@ -6162,7 +6157,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; out_put_css: - css_put(event->css); + css_put(css); out_put_cfile: fdput(cfile); out_put_eventfd: -- cgit v1.2.3 From fba94807837850e211f8975e1970e23e7804ff4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: cgroup, memcg: move cgroup->event_list[_lock] and event callbacks into memcg cgroup_event is being moved from cgroup core to memcg and the implementation is already moved by the previous patch. This patch moves the data fields and callbacks. * cgroup->event_list[_lock] are moved to mem_cgroup. * cftype->[un]register_event() are moved to cgroup_event. This makes it impossible for individual cftype definitions to specify their event callbacks. This is worked around by simply hard-coding filename to event callback mapping in cgroup_write_event_control(). This is awkward and inflexible, which is actually desirable given that we don't want to grow more usages of this feature. * eventfd_ctx declaration is removed from cgroup.h, which makes vmpressure.h miss eventfd_ctx declaration. Include eventfd.h from vmpressure.h. v2: Use file name from dentry instead of cftype. This will allow removing all cftype handling in the function. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- mm/memcontrol.c | 87 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d00368110b08..2fcacb18404b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -248,6 +248,22 @@ struct cgroup_event { * Each of these stored in a list by the cgroup. */ struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when * userspace closes eventfd. @@ -362,6 +378,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -5992,7 +6012,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6012,7 +6032,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6025,7 +6045,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, * side will require wqh->lock via remove_wait_queue(), * which we hold. */ - spin_lock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); if (!list_empty(&event->list)) { list_del_init(&event->list); /* @@ -6034,7 +6054,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, */ schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); } return 0; @@ -6059,12 +6079,13 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; struct fd cfile; + const char *name; char *endp; int ret; @@ -6118,6 +6139,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + /* * Verify @cfile should belong to @css. Also, remaining events are * automatically removed on cgroup destruction but the removal is @@ -6135,21 +6181,15 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(css, event->cft, - event->eventfd, buffer); + ret = event->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_css; efile.file->f_op->poll(efile.file, &event->pt); - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -6175,8 +6215,6 @@ static struct cftype mem_cgroup_files[] = { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -6236,14 +6274,10 @@ static struct cftype mem_cgroup_files[] = { .name = "oom_control", .read_map = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { @@ -6291,8 +6325,6 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -6483,6 +6515,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@ -6555,7 +6589,6 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup *cgrp = css->cgroup; struct cgroup_event *event, *tmp; /* @@ -6563,12 +6596,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); -- cgit v1.2.3 From 347c4a8747104a945ecced358944e42879176ca5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: memcg: remove cgroup_event->cft The only use of cgroup_event->cft is distinguishing "usage_in_bytes" and "memsw.usgae_in_bytes" for mem_cgroup_usage_[un]register_event(), which can be done by adding an explicit argument to the function and implementing two wrappers so that the two cases can be distinguished from the function alone. Remove cgroup_event->cft and the related code including [un]register_events() methods. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- mm/memcontrol.c | 65 +++++++++++++++++++++++++++++++-------------------------- mm/vmpressure.c | 14 +++---------- 2 files changed, 38 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2fcacb18404b..3c93dcfd26da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -236,10 +236,6 @@ struct cgroup_event { * css which the event belongs to. */ struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; /* * eventfd to signal userspace about the event. */ @@ -254,15 +250,13 @@ struct cgroup_event { * on eventfd to send notification to userspace. */ int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); + struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -5688,13 +5682,12 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -5771,13 +5764,24 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd, enum res_type type) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -5850,14 +5854,24 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); +} + static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -5876,13 +5890,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, } static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@ -6012,7 +6023,7 @@ static void cgroup_event_remove(struct work_struct *work) remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->cft, event->eventfd); + event->unregister_event(css, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -6133,12 +6144,6 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret < 0) goto out_put_cfile; - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows @@ -6157,8 +6162,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { - event->register_event = mem_cgroup_usage_register_event; - event->unregister_event = mem_cgroup_usage_unregister_event; + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; } else { ret = -EINVAL; goto out_put_cfile; @@ -6181,7 +6186,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->cft, event->eventfd, buffer); + ret = event->register_event(css, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..0f25a996d150 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -279,7 +279,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @css: css that is interested in vmpressure notifications - * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -289,13 +288,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or * "critical"). * - * This function should not be used directly, just pass it to (struct - * cftype).register_event, and then cgroup core will handle everything by - * itself. + * To be used as memcg event method. */ int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args) + struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; @@ -326,19 +322,15 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @css: css handle - * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * - * This function should not be used directly, just pass it to (struct - * cftype).unregister_event, and then cgroup core will handle everything - * by itself. + * To be used as memcg event method. */ void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = css_to_vmpressure(css); -- cgit v1.2.3 From 59b6f87344ab5eb3057e5844b8cd8a39e668f477 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:43 -0500 Subject: memcg: make cgroup_event deal with mem_cgroup instead of cgroup_subsys_state cgroup_event is now memcg specific. Replace cgroup_event->css with ->memcg and convert [un]register_event() callbacks to take mem_cgroup pointer instead of cgroup_subsys_state one. This simplifies the code slightly and makes css_to_vmpressure() unnecessary which is removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- mm/memcontrol.c | 53 ++++++++++++++++++++++------------------------------- mm/vmpressure.c | 12 ++++++------ 2 files changed, 28 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3c93dcfd26da..42f2843af1a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,9 +233,9 @@ struct mem_cgroup_eventfd_list { */ struct cgroup_event { /* - * css which the event belongs to. + * memcg which the event belongs to. */ - struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; /* * eventfd to signal userspace about the event. */ @@ -249,14 +249,14 @@ struct cgroup_event { * waiter for changes related to this event. Use eventfd_signal() * on eventfd to send notification to userspace. */ - int (*register_event)(struct cgroup_subsys_state *css, + int (*register_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); /* * unregister_event() callback will be called when userspace closes * the eventfd or on cgroup removing. This callback must be set, * if you want provide notification functionality. */ - void (*unregister_event)(struct cgroup_subsys_state *css, + void (*unregister_event)(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); /* * All fields below needed to unregister event when @@ -535,11 +535,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) -{ - return &mem_cgroup_from_css(css)->vmpressure; -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -5682,10 +5677,9 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int __mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 threshold, usage; @@ -5764,22 +5758,21 @@ unlock: return ret; } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEM); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); } -static int memsw_cgroup_usage_register_event(struct cgroup_subsys_state *css, +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - return __mem_cgroup_usage_register_event(css, eventfd, args, _MEMSWAP); + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); } -static void __mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; u64 usage; @@ -5854,22 +5847,21 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEM); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); } -static void memsw_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - return __mem_cgroup_usage_unregister_event(css, eventfd, _MEMSWAP); + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -5889,10 +5881,9 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; spin_lock(&memcg_oom_lock); @@ -6019,18 +6010,18 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup_subsys_state *css = event->css; + struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); - event->unregister_event(css, event->eventfd); + event->unregister_event(memcg, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); eventfd_ctx_put(event->eventfd); kfree(event); - css_put(css); + css_put(&memcg->css); } /* @@ -6043,7 +6034,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct mem_cgroup *memcg = mem_cgroup_from_css(event->css); + struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -6114,7 +6105,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (!event) return -ENOMEM; - event->css = css; + event->memcg = memcg; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -6186,7 +6177,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, if (ret) goto out_put_cfile; - ret = event->register_event(css, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buffer); if (ret) goto out_put_css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 0f25a996d150..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -278,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @css: css that is interested in vmpressure notifications + * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -290,10 +290,10 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * * To be used as memcg event method. */ -int vmpressure_register_event(struct cgroup_subsys_state *css, +int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; int level; @@ -321,7 +321,7 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @css: css handle + * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from @@ -330,10 +330,10 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, * * To be used as memcg event method. */ -void vmpressure_unregister_event(struct cgroup_subsys_state *css, +void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); -- cgit v1.2.3 From 3bc942f372af383f49d56aab599469561a5e39ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 18:20:44 -0500 Subject: memcg: rename cgroup_event to mem_cgroup_event cgroup_event is only available in memcg now. Let's brand it that way. While at it, add a comment encouraging deprecation of the feature and remove the respective section from cgroup documentation. This patch is cosmetic. v3: Typo update as per Li Zefan. v2: Index in cgroups.txt updated accordingly as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko --- mm/memcontrol.c | 57 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42f2843af1a7..ec8582b3a232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -231,7 +231,7 @@ struct mem_cgroup_eventfd_list { /* * cgroup_event represents events which userspace want to receive. */ -struct cgroup_event { +struct mem_cgroup_event { /* * memcg which the event belongs to. */ @@ -6001,15 +6001,28 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + /* * Unregister event and free resources. * * Gets called from workqueue. */ -static void cgroup_event_remove(struct work_struct *work) +static void memcg_event_remove(struct work_struct *work) { - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); struct mem_cgroup *memcg = event->memcg; remove_wait_queue(event->wqh, &event->wait); @@ -6029,11 +6042,11 @@ static void cgroup_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) { - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); struct mem_cgroup *memcg = event->memcg; unsigned long flags = (unsigned long)key; @@ -6062,27 +6075,29 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, return 0; } -static void cgroup_event_ptable_queue_proc(struct file *file, +static void memcg_event_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); event->wqh = wqh; add_wait_queue(wqh, &event->wait); } /* + * DO NOT USE IN NEW FILES. + * * Parse input and register new cgroup event handler. * * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event; + struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct fd efile; @@ -6107,9 +6122,9 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event->memcg = memcg; INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); efile = fdget(efd); if (!efile.file) { @@ -6140,6 +6155,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. */ name = cfile.file->f_dentry->d_name.name; @@ -6251,8 +6268,8 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_hierarchy_read, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6585,7 +6602,7 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cgroup_event *event, *tmp; + struct mem_cgroup_event *event, *tmp; /* * Unregister events and notify userspace. -- cgit v1.2.3 From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2013 15:44:27 -0700 Subject: block: Abstract out bvec iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Matthew Wilcox Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Boaz Harrosh Cc: Benny Halevy Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: "Nicholas A. Bellinger" Cc: Alexander Viro Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jaegeuk Kim Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust Cc: KONISHI Ryusuke Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: xfs@oss.sgi.com Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Herton Ronaldo Krzesinski Cc: Ben Hutchings Cc: Andrew Morton Cc: Guo Chao Cc: Tejun Heo Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Wei Yongjun Cc: "Roger Pau MonnĂ©" Cc: Jan Beulich Cc: Stefano Stabellini Cc: Ian Campbell Cc: Sebastian Ott Cc: Christian Borntraeger Cc: Minchan Kim Cc: Jiang Liu Cc: Nitin Gupta Cc: Jerome Marchand Cc: Joe Perches Cc: Peng Tao Cc: Andy Adamson Cc: fanchaoting Cc: Jie Liu Cc: Sunil Mushran Cc: "Martin K. Petersen" Cc: Namjae Jeon Cc: Pankaj Kumar Cc: Dan Magenheimer Cc: Mel Gorman 6 --- mm/page_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 8c79a4764be0..f14eded987fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_sector = map_swap_page(page, &bio->bi_bdev); - bio->bi_sector <<= PAGE_SHIFT - 9; + bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_size = PAGE_SIZE; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } return bio; @@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err) printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err) printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); goto out; } -- cgit v1.2.3 From 7988613b0e5b2638caf6cd493cc78e9595eba19c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2013 17:19:00 -0800 Subject: block: Convert bio_for_each_segment() to bvec_iter More prep work for immutable biovecs - with immutable bvecs drivers won't be able to use the biovec directly, they'll need to use helpers that take into account bio->bi_iter.bi_bvec_done. This updates callers for the new usage without changing the implementation yet. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Paul Clements Cc: Jim Paris Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Nagalakshmi Nandigama Cc: Sreekanth Reddy Cc: support@lsi.com Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: Alexander Viro Cc: Steven Whitehouse Cc: Herton Ronaldo Krzesinski Cc: Tejun Heo Cc: Andrew Morton Cc: Guo Chao Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Matthew Wilcox Cc: Keith Busch Cc: Stephen Hemminger Cc: Quoc-Son Anh Cc: Sebastian Ott Cc: Nitin Gupta Cc: Minchan Kim Cc: Jerome Marchand Cc: Seth Jennings Cc: "Martin K. Petersen" Cc: Mike Snitzer Cc: Vivek Goyal Cc: "Darrick J. Wong" Cc: Chris Metcalf Cc: Jan Kara Cc: linux-m68k@lists.linux-m68k.org Cc: linuxppc-dev@lists.ozlabs.org Cc: drbd-user@lists.linbit.com Cc: nbd-general@lists.sourceforge.net Cc: cbe-oss-dev@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Cc: virtualization@lists.linux-foundation.org Cc: linux-raid@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: DL-MPTFusionLinux@lsi.com Cc: linux-scsi@vger.kernel.org Cc: devel@driverdev.osuosl.org Cc: linux-fsdevel@vger.kernel.org Cc: cluster-devel@redhat.com Cc: linux-mm@kvack.org Acked-by: Geoff Levand --- mm/bounce.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/bounce.c b/mm/bounce.c index 5a7d58fb883b..523918b8c6dc 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -98,27 +98,24 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - bio_for_each_segment(tovec, to, i) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bvec_iter iter; + + bio_for_each_segment(tovec, to, iter) { + if (tovec.bv_page != fromvec->bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + flush_dcache_page(tovec.bv_page); + } - bounce_copy_vec(tovec, vfrom); - flush_dcache_page(tovec->bv_page); + fromvec++; } } @@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, { struct bio *bio; int rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; + struct bio_vec *to, from; + struct bvec_iter iter; unsigned i; if (force) goto bounce; - bio_for_each_segment(from, *bio_orig, i) - if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + bio_for_each_segment(from, *bio_orig, iter) + if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) goto bounce; return; -- cgit v1.2.3 From 791badbdb3e4fc1001ee3bcdaedc6d4f167fcbe8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:02 -0500 Subject: memcg: convert away from cftype->read() and ->read_map() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. cftype->read_map() doesn't add any value and being replaced with ->read_seq_string(), and all users of cftype->read() can be easily served, usually better, by seq_file and other methods. Update mem_cgroup_read() to return u64 instead of printing itself and rename it to mem_cgroup_read_u64(), and update mem_cgroup_oom_control_read() to use ->read_seq_string() instead of ->read_map(). This patch doesn't make any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- mm/memcontrol.c | 49 +++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7aa0d405b148..f149521a77e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5150,14 +5150,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - char str[64]; u64 val; - int name, len; + int name; enum res_type type; type = MEMFILE_TYPE(cft->private); @@ -5183,8 +5181,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, BUG(); } - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return val; } static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) @@ -5911,16 +5908,12 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, } static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) + struct cftype *cft, struct seq_file *sf) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); - - if (atomic_read(&memcg->under_oom)) - cb->fill(cb, "under_oom", 1); - else - cb->fill(cb, "under_oom", 0); + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); return 0; } @@ -6239,31 +6232,31 @@ static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "stat", @@ -6297,7 +6290,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_map = mem_cgroup_oom_control_read, + .read_seq_string = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, @@ -6315,24 +6308,24 @@ static struct cftype mem_cgroup_files[] = { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO { @@ -6349,25 +6342,25 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ }; -- cgit v1.2.3 From 716f479d279fb456f58be44180d7479da75e5a4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:03 -0500 Subject: hugetlb_cgroup: convert away from cftype->read() In preparation of conversion to kernfs, cgroup file handling is being consolidated so that it can be easily mapped to the seq_file based interface of kernfs. All users of cftype->read() can be easily served, usually better, by seq_file and other methods. Update hugetlb_cgroup_read() to return u64 instead of printing itself and rename it to hugetlb_cgroup_read_u64(). This patch doesn't make any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: Li Zefan Cc: Aneesh Kumar K.V Cc: Johannes Weiner --- mm/hugetlb_cgroup.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index bda8e44f6fde..d747a84e09b0 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - u64 val; - char str[64]; - int idx, name, len; + int idx, name; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); - val = res_counter_read_u64(&h_cg->hugepage[idx], name); - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return res_counter_read_u64(&h_cg->hugepage[idx], name); } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, @@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; cft->write_string = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files[4]; -- cgit v1.2.3 From 2da8ca822d49c8b8781800ad155aaa00e7bb5f1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Dec 2013 12:28:04 -0500 Subject: cgroup: replace cftype->read_seq_string() with cftype->seq_show() In preparation of conversion to kernfs, cgroup file handling is updated so that it can be easily mapped to kernfs. This patch replaces cftype->read_seq_string() with cftype->seq_show() which is not limited to single_open() operation and will map directcly to kernfs seq_file interface. The conversions are mechanical. As ->seq_show() doesn't have @css and @cft, the functions which make use of them are converted to use seq_css() and seq_cft() respectively. In several occassions, e.f. if it has seq_string in its name, the function name is updated to fit the new method better. This patch does not introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Michal Hocko Acked-by: Daniel Wagner Acked-by: Li Zefan Cc: Jens Axboe Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Neil Horman --- mm/memcontrol.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f149521a77e6..9252219376cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3014,10 +3014,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -5418,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { const char *name; @@ -5435,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -5474,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct mem_cgroup *mi; unsigned int i; @@ -5907,10 +5904,9 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); @@ -6260,7 +6256,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "stat", - .read_seq_string = memcg_stat_show, + .seq_show = memcg_stat_show, }, { .name = "force_empty", @@ -6290,7 +6286,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_seq_string = mem_cgroup_oom_control_read, + .seq_show = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, @@ -6300,7 +6296,7 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_NUMA { .name = "numa_stat", - .read_seq_string = memcg_numa_stat_show, + .seq_show = memcg_numa_stat_show, }, #endif #ifdef CONFIG_MEMCG_KMEM @@ -6330,7 +6326,7 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .read_seq_string = mem_cgroup_slabinfo_read, + .seq_show = mem_cgroup_slabinfo_read, }, #endif #endif -- cgit v1.2.3 From 5877231f646bbd6d1d545e7af83aaa6e6b746013 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 6 Dec 2013 00:08:22 +0530 Subject: mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE. On archs like ppc64 that don't use _PAGE_PROTNONE and also have a separate page table outside linux pagetable, we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. We still need to make sure we use the numa faulting logic only when CONFIG_NUMA_BALANCING is set. This implies the migrate-on-fault (Lazy migration) via mbind will only work if CONFIG_NUMA_BALANCING is set. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Benjamin Herrenschmidt --- mm/mempolicy.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eca4a3129129..9f73b29d304d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +#ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these @@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); if (nr_updated) @@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, { return 0; } -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#endif /* CONFIG_NUMA_BALANCING */ /* * Walk through page tables and collect pages to be migrated. -- cgit v1.2.3 From 8afb1474db4701d1ab80cd8251137a3260e6913e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 10 Sep 2013 11:43:37 +0800 Subject: slub: Fix calculation of cpu slabs /sys/kernel/slab/:t-0000048 # cat cpu_slabs 231 N0=16 N1=215 /sys/kernel/slab/:t-0000048 # cat slabs 145 N0=36 N1=109 See, the number of slabs is smaller than that of cpu slabs. The bug was introduced by commit 49e2258586b423684f03c278149ab46d8f8b6700 ("slub: per cpu cache for partial pages"). We should use page->pages instead of page->pobjects when calculating the number of cpu partial slabs. This also fixes the mapping of slabs and nodes. As there's no variable storing the number of total/active objects in cpu partial slabs, and we don't have user interfaces requiring those statistics, I just add WARN_ON for those cases. Cc: # 3.2+ Acked-by: Christoph Lameter Reviewed-by: Wanpeng Li Signed-off-by: Li Zefan Signed-off-by: Pekka Enberg --- mm/slub.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 545a170ebf9f..89490d9d91e0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4299,7 +4299,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, page = ACCESS_ONCE(c->partial); if (page) { - x = page->pobjects; + node = page_to_nid(page); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); + else + x = page->pages; total += x; nodes[node] += x; } -- cgit v1.2.3 From b3ff8a2f9569fb41b9cf8902897d787a33bac84f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 12 Jan 2014 20:23:27 -0800 Subject: cgroup: remove stray references to css_id Trivial: remove the few stray references to css_id, which itself was removed in v3.13's 2ff2a7d03bbe "cgroup: kill css_id". Signed-off-by: Hugh Dickins Signed-off-by: Tejun Heo --- mm/page_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a872a..3bd0b8e6ab12 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -451,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { -- cgit v1.2.3 From c65c1877bd6826ce0d9713d76e30a7bed8e49f38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jan 2014 13:23:49 +0100 Subject: slub: use lockdep_assert_held Instead of using comments in an attempt at getting the locking right, use proper assertions that actively warn you if you got it wrong. Also add extra braces in a few sites to comply with coding-style. Signed-off-by: Peter Zijlstra Signed-off-by: Pekka Enberg --- mm/slub.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 89490d9d91e0..367b224f2aa5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -985,23 +985,22 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held. */ static void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { + lockdep_assert_held(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) return; list_add(&page->lru, &n->full); } -/* - * list_lock must be held. - */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { + lockdep_assert_held(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) return; @@ -1250,7 +1249,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1504,12 +1504,12 @@ static void discard_slab(struct kmem_cache *s, struct page *page) /* * Management of partially allocated slabs. - * - * list_lock must be held. */ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { + lockdep_assert_held(&n->list_lock); + n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); @@ -1517,12 +1517,11 @@ static inline void add_partial(struct kmem_cache_node *n, list_add(&page->lru, &n->partial); } -/* - * list_lock must be held. - */ static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { + lockdep_assert_held(&n->list_lock); + list_del(&page->lru); n->nr_partial--; } @@ -1532,8 +1531,6 @@ static inline void remove_partial(struct kmem_cache_node *n, * return the pointer to the freelist. * * Returns a list of objects or NULL if it fails. - * - * Must hold list_lock since we modify the partial list. */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, @@ -1543,6 +1540,8 @@ static inline void *acquire_slab(struct kmem_cache *s, unsigned long counters; struct page new; + lockdep_assert_held(&n->list_lock); + /* * Zap the freelist and set the frozen bit. * The old freelist is the list of objects for the @@ -1887,7 +1886,7 @@ redo: else if (l == M_FULL) - remove_full(s, page); + remove_full(s, n, page); if (m == M_PARTIAL) { @@ -2541,7 +2540,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, new.inuse--; if ((!new.inuse || !prior) && !was_frozen) { - if (kmem_cache_has_cpu_partial(s) && !prior) + if (kmem_cache_has_cpu_partial(s) && !prior) { /* * Slab was on no list before and will be @@ -2551,7 +2550,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ new.frozen = 1; - else { /* Needs to be taken off a list */ + } else { /* Needs to be taken off a list */ n = get_node(s, page_to_nid(page)); /* @@ -2600,7 +2599,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { if (kmem_cache_debug(s)) - remove_full(s, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -2614,9 +2613,10 @@ slab_empty: */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } else + } else { /* Slab must be on the full list */ - remove_full(s, page); + remove_full(s, n, page); + } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); -- cgit v1.2.3 From 26e4f2057516f1c457e0e95346a00303f983ad53 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 4 Jan 2014 16:32:31 +0900 Subject: slub: Fix possible format string bug. The "name" is determined at runtime and is parsed as format string. Acked-by: David Rientjes Signed-off-by: Tetsuo Handa Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 367b224f2aa5..a99e9e67c60e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5169,7 +5169,7 @@ static int sysfs_slab_add(struct kmem_cache *s) } s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) { kobject_put(&s->kobj); return err; -- cgit v1.2.3 From b3084f4db3aeb991c507ca774337c7e7893ed04f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 13 Jan 2014 11:34:24 +0530 Subject: powerpc/thp: Fix crash on mremap This patch fix the below crash NIP [c00000000004cee4] .__hash_page_thp+0x2a4/0x440 LR [c0000000000439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c000000736103c40] [00001ffffb000000] 0x1ffffb000000(unreliable) [437908.479693] [c000000736103d50] [c0000000000439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c000000736103e30] [c00000000000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Signed-off-by: Benjamin Herrenschmidt --- mm/huge_memory.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..5d80c53b87cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - /* - * Move preallocated PTE page table if new_pmd is on - * different PMD page table. - */ + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- cgit v1.2.3 From 8a0921712ec6d00754b5d7afea78137772efee0a Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 2 Jan 2014 13:53:21 -0800 Subject: percpu: use VMALLOC_TOTAL instead of VMALLOC_END - VMALLOC_START vmalloc already gives a useful macro to calculate the total vmalloc size. Use it. Signed-off-by: Laura Abbott Signed-off-by: Tejun Heo --- mm/percpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 0d10defe951e..afbf352ae580 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1686,10 +1686,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, max_distance += ai->unit_size; /* warn if maximum distance is further than 75% of vmalloc space */ - if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " "space 0x%lx\n", max_distance, - (unsigned long)(VMALLOC_END - VMALLOC_START)); + VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; -- cgit v1.2.3 From 0abdd7a81b7e3fd781d7fabcca49501852bba17e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 Jan 2014 15:48:12 -0800 Subject: dma-debug: introduce debug_dma_assert_idle() Record actively mapped pages and provide an api for asserting a given page is dma inactive before execution proceeds. Placing debug_dma_assert_idle() in cow_user_page() flagged the violation of the dma-api in the NET_DMA implementation (see commit 77873803363c "net_dma: mark broken"). The implementation includes the capability to count, in a limited way, repeat mappings of the same page that occur without an intervening unmap. This 'overlap' counter is limited to the few bits of tag space in a radix tree. This mechanism is added to mitigate false negative cases where, for example, a page is dma mapped twice and debug_dma_assert_idle() is called after the page is un-mapped once. Signed-off-by: Dan Williams Cc: Joerg Roedel Cc: Vinod Koul Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..e9c550484ba6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { + debug_dma_assert_idle(src); + /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by -- cgit v1.2.3 From a0368d4e48fc9ad65a66f6819a801f3f542b4f0f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:49 -0800 Subject: mm: hugetlb: use get_page_foll() in follow_hugetlb_page() get_page_foll() is more optimal and is always safe to use under the PT lock. More so for hugetlbfs as there's no risk of race conditions with split_huge_page regardless of the PT lock. Signed-off-by: Andrea Arcangeli Tested-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4e6d34..7596e104bffa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3079,7 +3079,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page(pages[i]); + get_page_foll(pages[i]); } if (vmas) -- cgit v1.2.3 From ebf360f9bb957f68e19e88f5067c015997dc26a6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:51 -0800 Subject: mm: hugetlbfs: move the put/get_page slab and hugetlbfs optimization in a faster path We don't actually need a reference on the head page in the slab and hugetlbfs paths, as long as we add a smp_rmb() which should be faster than get_page_unless_zero. [akpm@linux-foundation.org: fix typo in comment] Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 140 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 84b26aaabd03..e2757fbb04ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -86,45 +86,61 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); + /* + * THP can not break up slab pages so avoid taking + * compound_lock(). Slab performs non-atomic bit ops + * on page->flags for better performance. In + * particular slab_unlock() in slub used to be a hot + * path. It is still hot on arches that do not support + * this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it + * cannot be splitted and the head page cannot change + * from under us. And if "page" is part of a THP page + * under splitting, if the head page pointed by the + * THP tail isn't a THP head anymore, we'll find + * PageTail clear after smp_rmb() and we'll treat it + * as a single page. + */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory + * barriers between clearing PageTail and before the + * head page can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + return; + } else + /* + * __split_huge_page_refcount + * run before us, "page" was a + * THP tail. The split + * page_head has been freed + * and reallocated as slab or + * hugetlbfs page of smaller + * order (only possible if + * reallocated as slab on + * x86). + */ + goto out_put_single; + } + if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; - /* - * THP can not break up slab pages so avoid taking - * compound_lock(). Slab performs non-atomic bit ops - * on page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is - * still hot on arches that do not support - * this_cpu_cmpxchg_double(). - */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); - return; - } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ - goto skip_lock; - } /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -135,7 +151,6 @@ static void put_compound_page(struct page *page) if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); -skip_lock: if (put_page_testzero(page_head)) { /* * The head page may have been @@ -221,36 +236,37 @@ bool __get_page_tail(struct page *page) * split_huge_page(). */ unsigned long flags; - bool got = false; + bool got; struct page *page_head = compound_trans_head(page); - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - __get_page_tail_foll(page, false); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - put_page(page_head); - return false; - } + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + __get_page_tail_foll(page, true); + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ + return false; } + } + got = false; + if (likely(page != page_head && get_page_unless_zero(page_head))) { /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time -- cgit v1.2.3 From 44518d2b32646e37b4b7a0813bbbe98dc21c7f8f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:54 -0800 Subject: mm: tail page refcounting optimization for slab and hugetlbfs This skips the _mapcount mangling for slab and hugetlbfs pages. The main trouble in doing this is to guarantee that PageSlab and PageHeadHuge remains constant for all get_page/put_page run on the tail of slab or hugetlbfs compound pages. Otherwise if they're set during get_page but not set during put_page, the _mapcount of the tail page would underflow. PageHeadHuge will remain true until the compound page is released and enters the buddy allocator so it won't risk to change even if the tail page is the last reference left on the page. PG_slab instead is cleared before the slab frees the head page with put_page, so if the tail pin is released after the slab freed the page, we would have a problem. But in the slab case the tail pin cannot be the last reference left on the page. This is because the slab code is free to reuse the compound page after a kfree/kmem_cache_free without having to check if there's any tail pin left. In turn all tail pins must be always released while the head is still pinned by the slab code and so we know PG_slab will be still set too. Signed-off-by: Andrea Arcangeli Reviewed-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/swap.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..a85a3ab1f7ef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,7 +51,8 @@ static inline void __get_page_tail_foll(struct page *page, VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - atomic_inc(&page->_mapcount); + if (compound_tail_refcounted(page->first_page)) + atomic_inc(&page->_mapcount); } /* diff --git a/mm/swap.c b/mm/swap.c index e2757fbb04ea..bba4aa5bf686 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -88,8 +88,9 @@ static void put_compound_page(struct page *page) /* * THP can not break up slab pages so avoid taking - * compound_lock(). Slab performs non-atomic bit ops - * on page->flags for better performance. In + * compound_lock() and skip the tail page refcounting + * (in _mapcount) too. Slab performs non-atomic bit + * ops on page->flags for better performance. In * particular slab_unlock() in slub used to be a hot * path. It is still hot on arches that do not support * this_cpu_cmpxchg_double(). @@ -102,7 +103,7 @@ static void put_compound_page(struct page *page) * PageTail clear after smp_rmb() and we'll treat it * as a single page. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (!__compound_tail_refcounted(page_head)) { /* * If "page" is a THP tail, we must read the tail page * flags after the head page flags. The @@ -117,10 +118,30 @@ static void put_compound_page(struct page *page) * cannot race here. */ VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a + * slab compound page, the + * tail pin must not be the + * last reference held on the + * page, because the PG_slab + * cannot be cleared before + * all tail pins (which skips + * the _mapcount tail + * refcounting) have been + * released. For hugetlbfs the + * tail pin may be the last + * reference on the page + * instead, because + * PageHeadHuge will not go + * away until the compound + * page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); __put_compound_page(page_head); + } return; } else /* -- cgit v1.2.3 From 3bfcd13ec0b43b39b02072ba67bf197d15379387 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:56 -0800 Subject: mm: hugetlbfs: use __compound_tail_refcounted in __get_page_tail too Also remove hugetlb.h which isn't needed anymore as PageHeadHuge is handled in mm.h. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index bba4aa5bf686..7434e3619c14 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "internal.h" @@ -261,7 +260,7 @@ bool __get_page_tail(struct page *page) struct page *page_head = compound_trans_head(page); /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (!__compound_tail_refcounted(page_head)) { smp_rmb(); if (likely(PageTail(page))) { /* -- cgit v1.2.3 From 758f66a29ccc6383353fd395aa04be15e8dea445 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:57 -0800 Subject: mm/hugetlb.c: simplify PageHeadHuge() and PageHuge() Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7596e104bffa..1d9125360bf5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ int PageHuge(struct page *page) { - compound_page_dtor *dtor; - if (!PageCompound(page)) return 0; page = compound_head(page); - dtor = get_compound_page_dtor(page); - - return dtor == free_huge_page; + return get_compound_page_dtor(page) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHuge); @@ -708,14 +704,10 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - compound_page_dtor *dtor; - if (!PageHead(page_head)) return 0; - dtor = get_compound_page_dtor(page_head); - - return dtor == free_huge_page; + return get_compound_page_dtor(page_head) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHeadHuge); -- cgit v1.2.3 From 26296ad2dfb4059f840e46cd7af38d0025a9d8d7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:59 -0800 Subject: mm/swap.c: reorganize put_compound_page() Tweak it so save a tab stop, make code layout slightly less nutty. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 254 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 125 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 7434e3619c14..d1100b619e61 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -81,154 +81,150 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - if (unlikely(PageTail(page))) { - /* __split_huge_page_refcount can run under us */ - struct page *page_head = compound_trans_head(page); + struct page *page_head; - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting - * (in _mapcount) too. Slab performs non-atomic bit - * ops on page->flags for better performance. In - * particular slab_unlock() in slub used to be a hot - * path. It is still hot on arches that do not support - * this_cpu_cmpxchg_double(). - * - * If "page" is part of a slab or hugetlbfs page it - * cannot be splitted and the head page cannot change - * from under us. And if "page" is part of a THP page - * under splitting, if the head page pointed by the - * THP tail isn't a THP head anymore, we'll find - * PageTail clear after smp_rmb() and we'll treat it - * as a single page. - */ - if (!__compound_tail_refcounted(page_head)) { + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. The - * split_huge_page side enforces write memory - * barriers between clearing PageTail and before the - * head page can be freed and reallocated. + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) != 0); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a - * slab compound page, the - * tail pin must not be the - * last reference held on the - * page, because the PG_slab - * cannot be cleared before - * all tail pins (which skips - * the _mapcount tail - * refcounting) have been - * released. For hugetlbfs the - * tail pin may be the last - * reference on the page - * instead, because - * PageHeadHuge will not go - * away until the compound - * page enters the buddy - * allocator. - */ - VM_BUG_ON(PageSlab(page_head)); - __put_compound_page(page_head); - } - return; - } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ - goto out_put_single; + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } + return; + } - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - unsigned long flags; + /* __split_huge_page_refcount can run under us */ + page_head = compound_trans_head(page); + /* + * THP can not break up slab pages so avoid taking + * compound_lock() and skip the tail page refcounting (in + * _mapcount) too. Slab performs non-atomic bit ops on + * page->flags for better performance. In particular + * slab_unlock() in slub used to be a hot path. It is still + * hot on arches that do not support + * this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it cannot be + * splitted and the head page cannot change from under us. And + * if "page" is part of a THP page under splitting, if the + * head page pointed by the THP tail isn't a THP head anymore, + * we'll find PageTail clear after smp_rmb() and we'll treat + * it as a single page. + */ + if (!__compound_tail_refcounted(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. + * __split_huge_page_refcount cannot race + * here. */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The head page may have been - * freed and reallocated as a - * compound page of smaller - * order and then freed again. - * All we know is that it - * cannot have become: a THP - * page, a compound page of - * higher order, a tail page. - * That is because we still - * hold the refcount of the - * split THP tail and - * page_head was the THP head - * before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; + VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab + * compound page, the tail pin must + * not be the last reference held on + * the page, because the PG_slab + * cannot be cleared before all tail + * pins (which skips the _mapcount + * tail refcounting) have been + * released. For hugetlbfs the tail + * pin may be the last reference on + * the page instead, because + * PageHeadHuge will not go away until + * the compound page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); + __put_compound_page(page_head); } - VM_BUG_ON(page_head != page->first_page); + return; + } else /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on - * the compound_lock. + * __split_huge_page_refcount run before us, + * "page" was a THP tail. The split page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). */ - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - compound_unlock_irqrestore(page_head, flags); + goto out_put_single; + } + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { + /* + * The head page may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ if (PageHead(page_head)) __put_compound_page(page_head); else __put_single_page(page_head); } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON(PageTail(page)); - goto out_put_single; +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; } - } else if (put_page_testzero(page)) { - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; } } -- cgit v1.2.3 From 9b7ac260188ddacffdcaadd6a61e4a502238a63f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:49:01 -0800 Subject: mm/hugetlb.c: defer PageHeadHuge() symbol export No actual need of it. So keep it internal. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d9125360bf5..f730b7a37590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -709,7 +709,6 @@ int PageHeadHuge(struct page *page_head) return get_compound_page_dtor(page_head) == free_huge_page; } -EXPORT_SYMBOL_GPL(PageHeadHuge); pgoff_t __basepage_index(struct page *page) { -- cgit v1.2.3 From c728852f5dd41ce34e7ce0a179cf28cd5f4dc301 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:02 -0800 Subject: mm: thp: __get_page_tail_foll() can use get_huge_page_tail() Cleanup. Change __get_page_tail_foll() to use get_huge_page_tail() to avoid the code duplication. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Dave Jones Cc: Darren Hart Cc: Peter Zijlstra Cc: Mel Gorman Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a85a3ab1f7ef..a346ba120e42 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -47,12 +47,9 @@ static inline void __get_page_tail_foll(struct page *page, * page_cache_get_speculative()) on tail pages. */ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - if (compound_tail_refcounted(page->first_page)) - atomic_inc(&page->_mapcount); + get_huge_page_tail(page); } /* -- cgit v1.2.3 From 943dca1a1fcbccb58de944669b833fd38a6c809b Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Jan 2014 15:49:06 -0800 Subject: mm: get rid of unnecessary pageblock scanning in setup_zone_migrate_reserve Yasuaki Ishimatsu reported memory hot-add spent more than 5 _hours_ on 9TB memory machine since onlining memory sections is too slow. And we found out setup_zone_migrate_reserve spent >90% of the time. The problem is, setup_zone_migrate_reserve scans all pageblocks unconditionally, but it is only necessary if the number of reserved block was reduced (i.e. memory hot remove). Moreover, maximum MIGRATE_RESERVE per zone is currently 2. It means that the number of reserved pageblocks is almost always unchanged. This patch adds zone->nr_migrate_reserve_block to maintain the number of MIGRATE_RESERVE pageblocks and it reduces the overhead of setup_zone_migrate_reserve dramatically. The following table shows time of onlining a memory section. Amount of memory | 128GB | 192GB | 256GB| --------------------------------------------- linux-3.12 | 23.9 | 31.4 | 44.5 | This patch | 8.3 | 8.3 | 8.6 | Mel's proposal patch | 10.9 | 19.2 | 31.3 | --------------------------------------------- (millisecond) 128GB : 4 nodes and each node has 32GB of memory 192GB : 6 nodes and each node has 32GB of memory 256GB : 8 nodes and each node has 32GB of memory (*1) Mel proposed his idea by the following threads. https://lkml.org/lkml/2013/10/30/272 [akpm@linux-foundation.org: tweak comment] Signed-off-by: KOSAKI Motohiro Signed-off-by: Yasuaki Ishimatsu Reported-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..89d81f4429ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3901,6 +3901,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -3922,6 +3923,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -3959,6 +3966,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* -- cgit v1.2.3 From b35f1819acd9243a3ff7ad25b1fa8bd6bfe80fb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 21 Jan 2014 15:49:07 -0800 Subject: mm: create a separate slab for page->ptl allocation If DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC are enabled spinlock_t on x86_64 is 72 bytes. For page->ptl they will be allocated from kmalloc-96 slab, so we loose 24 on each. An average system can easily allocate few tens thousands of page->ptl and overhead is significant. Let's create a separate slab for page->ptl allocation to solve this. To make sure that it really works this time, some numbers from my test machine (just booted, no load): Before: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo kmalloc-96 31987 32190 128 30 1 : tunables 120 60 8 : slabdata 1073 1073 92 After: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo page->ptl 27516 28143 72 53 1 : tunables 120 60 8 : slabdata 531 531 9 kmalloc-96 3853 5280 128 30 1 : tunables 120 60 8 : slabdata 176 176 0 Note that the patch is useful not only for debug case, but also for PREEMPT_RT, where spinlock_t is always bloated. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e9c550484ba6..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4275,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + bool ptlock_alloc(struct page *page) { spinlock_t *ptl; - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; page->ptl = ptl; @@ -4288,6 +4297,6 @@ bool ptlock_alloc(struct page *page) void ptlock_free(struct page *page) { - kfree(page->ptl); + kmem_cache_free(page_ptl_cachep, page->ptl); } #endif -- cgit v1.2.3 From 549543dff797ae1081f61a69f8511c61806c3735 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 21 Jan 2014 15:49:08 -0800 Subject: mm, memory-failure: fix typo in me_pagecache_dirty() [akpm@linux-foundation.org: s/cache/pagecache/] Signed-off-by: Zhi Yong Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..9fa6586d5275 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ -- cgit v1.2.3 From e8569dd299dbc7bac878325c0bdc7aa449eae479 Mon Sep 17 00:00:00 2001 From: Andreas Sandberg Date: Tue, 21 Jan 2014 15:49:09 -0800 Subject: mm/hugetlb.c: call MMU notifiers when copying a hugetlb page range When copy_hugetlb_page_range() is called to copy a range of hugetlb mappings, the secondary MMUs are not notified if there is a protection downgrade, which breaks COW semantics in KVM. This patch adds the necessary MMU notifier calls. Signed-off-by: Andreas Sandberg Acked-by: Steve Capper Acked-by: Marc Zyngier Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f730b7a37590..1697ff0cc53a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2346,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + int ret = 0; cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + mmun_start = vma->vm_start; + mmun_end = vma->vm_end; + if (cow) + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); - if (!dst_pte) - goto nomem; + if (!dst_pte) { + ret = -ENOMEM; + break; + } /* If the pagetables are shared don't copy or take references */ if (dst_pte == src_pte) @@ -2377,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); } - return 0; -nomem: - return -ENOMEM; + if (cow) + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + + return ret; } static int is_hugetlb_entry_migration(pte_t pte) -- cgit v1.2.3 From ece86e222db48d04bda218a2be70e384518bb08c Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Tue, 21 Jan 2014 15:49:12 -0800 Subject: mm/vmalloc: interchage the implementation of vmalloc_to_{pfn,page} Currently we are implementing vmalloc_to_pfn() as a wrapper around vmalloc_to_page(), which is implemented as follow: 1. walks the page talbes to generates the corresponding pfn, 2. then converts the pfn to struct page, 3. returns it. And vmalloc_to_pfn() re-wraps vmalloc_to_page() to get the pfn. This seems too circuitous, so this patch reverses the way: implement vmalloc_to_page() as a wrapper around vmalloc_to_pfn(). This makes vmalloc_to_pfn() and vmalloc_to_page() slightly more efficient. No functional change. Signed-off-by: Jianyu Zhan Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..e4f0db2a3eae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the physical pfn it maps to. */ -struct page *vmalloc_to_page(const void *vmalloc_addr) +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; + unsigned long pfn = 0; pgd_t *pgd = pgd_offset_k(addr); /* @@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) - page = pte_page(pte); + pfn = pte_pfn(pte); pte_unmap(ptep); } } } - return page; + return pfn; } -EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vmalloc_to_pfn); /* - * Map a vmalloc()-space virtual address to the physical page frame number. + * Map a vmalloc()-space virtual address to the struct page. */ -unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +struct page *vmalloc_to_page(const void *vmalloc_addr) { - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); + return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); } -EXPORT_SYMBOL(vmalloc_to_pfn); +EXPORT_SYMBOL(vmalloc_to_page); /*** Global kva allocator ***/ -- cgit v1.2.3 From aec6a8889a98a0cd58357cd0937a25189908f191 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:49:13 -0800 Subject: mm, show_mem: remove SHOW_MEM_FILTER_PAGE_COUNT Commit 4b59e6c47309 ("mm, show_mem: suppress page counts in non-blockable contexts") introduced SHOW_MEM_FILTER_PAGE_COUNT to suppress PFN walks on large memory machines. Commit c78e93630d15 ("mm: do not walk all of system memory during show_mem") avoided a PFN walk in the generic show_mem helper which removes the requirement for SHOW_MEM_FILTER_PAGE_COUNT in that case. This patch removes PFN walkers from the arch-specific implementations that report on a per-node or per-zone granularity. ARM and unicore32 still do a PFN walk as they report memory usage on each bank which is a much finer granularity where the debugging information may still be of use. As the remaining arches doing PFN walks have relatively small amounts of memory, this patch simply removes SHOW_MEM_FILTER_PAGE_COUNT. [akpm@linux-foundation.org: fix parisc] Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Tony Luck Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d81f4429ca..ec4417cb458a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2071,13 +2071,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) debug_guardpage_minorder() > 0) return; - /* - * Walking all memory to count page types is very expensive and should - * be inhibited in non-blockable contexts. - */ - if (!(gfp_mask & __GFP_WAIT)) - filter |= SHOW_MEM_FILTER_PAGE_COUNT; - /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set -- cgit v1.2.3 From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 21 Jan 2014 15:49:14 -0800 Subject: mm: add overcommit_kbytes sysctl variable Some applications that run on HPC clusters are designed around the availability of RAM and the overcommit ratio is fine tuned to get the maximum usage of memory without swapping. With growing memory, the 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse for these workload (on a 2TB machine it represents no less than 20GB). This patch adds the new overcommit_kbytes sysctl variable that allow a much finer grain. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 + mm/nommu.c | 1 + mm/util.c | 36 ++++++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 834b2d785f1e..39552de6e1db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ diff --git a/mm/util.c b/mm/util.c index 808f375648e7..a24aa22f2473 100644 --- a/mm/util.c +++ b/mm/util.c @@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) return mapping; } +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; } -- cgit v1.2.3 From 363ee17f0f405faff74d9aaf93d21d5f41d5102d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:15 -0800 Subject: mm/mmap.c: add mlock_future_check() helper Both do_brk and do_mmap_pgoff verify that we are actually capable of locking future pages if the corresponding VM_LOCKED flags are used. Encapsulate this logic into a single mlock_future_check() helper function. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 39552de6e1db..a0e7153a79e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1191,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } +static inline int mlock_future_check(struct mm_struct *mm, + unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ @@ -1252,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; if (file) { struct inode *inode = file_inode(file); @@ -2592,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (error & ~PAGE_MASK) return error; - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; /* * mm->mmap_sem is required to protect against another thread -- cgit v1.2.3 From 1f1cd7054fe7f45e65dd4963d0a38e5ab7a57cae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:16 -0800 Subject: mm/mlock: prepare params outside critical region All mlock related syscalls prepare lock limits, lengths and start parameters with the mmap_sem held. Move this logic outside of the critical region. For the case of mlock, continue incrementing the amount already locked by mm->locked_vm with the rwsem taken. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; - locked = len >> PAGE_SHIFT; - locked += current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked += current->mm->locked_vm; /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); if (!error) error = __mm_populate(start, len, 0); @@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; + + down_write(¤t->mm->mmap_sem); ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); + return ret; } @@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; + down_write(¤t->mm->mmap_sem); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); -- cgit v1.2.3 From 931d13f534a9bb39539f0a851209ca18013ba0c2 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:49:17 -0800 Subject: mm/memblock: debug: correct displaying of upper memory boundary Current memblock APIs don't work on 32 PAE or LPAE extension arches where the physical memory start address beyond 4GB. The problem was discussed here [3] where Tejun, Yinghai(thanks) proposed a way forward with memblock interfaces. Based on the proposal, this series adds necessary memblock interfaces and convert the core kernel code to use them. Architectures already converted to NO_BOOTMEM use these new interfaces and other which still uses bootmem, these new interfaces just fallback to exiting bootmem APIs. So no functional change in behavior. In long run, once all the architectures moves to NO_BOOTMEM, we can get rid of bootmem layer completely. This is one step to remove the core code dependency with bootmem and also gives path for architectures to move away from bootmem. Testing is done on ARM architecture with 32 bit ARM LAPE machines with normal as well sparse(faked) memory model. This patch (of 23): When debugging is enabled (cmdline has "memblock=debug") the memblock will display upper memory boundary per each allocated/freed memory range wrongly. For example: memblock_reserve: [0x0000009e7e8000-0x0000009e7ed000] _memblock_early_alloc_try_nid_nopanic+0xfc/0x12c The 0x0000009e7ed000 is displayed instead of 0x0000009e7ecfff Hence, correct this by changing formula used to calculate upper memory boundary to (u64)base + size - 1 instead of (u64)base + size everywhere in the debug messages. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Acked-by: Tejun Heo Cc: H. Peter Anvin Cc: Russell King Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 53e477bb5558..aab566998b61 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -643,7 +643,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return __memblock_remove(&memblock.reserved, base, size); @@ -655,7 +655,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return memblock_add_region(_rgn, base, size, MAX_NUMNODES); -- cgit v1.2.3 From 66a20757214d94b915f2d2aada1384dead9ab18d Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:20 -0800 Subject: memblock, numa: introduce flags field into memblock There is no flag in memblock to describe what type the memory is. Sometimes, we may use memblock to reserve some memory for special usage. And we want to know what kind of memory it is. So we need a way to In hotplug environment, we want to reserve hotpluggable memory so the kernel won't be able to use it. And when the system is up, we have to free these hotpluggable memory to buddy. So we need to mark these memory first. In order to do so, we need to mark out these special memory in memblock. In this patch, we introduce a new "flags" member into memblock_region: struct memblock_region { phys_addr_t base; phys_addr_t size; unsigned long flags; /* This is new. */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int nid; #endif }; This patch does the following things: 1) Add "flags" member to memblock_region. 2) Modify the following APIs' prototype: memblock_add_region() memblock_insert_region() 3) Add memblock_reserve_region() to support reserve memory with flags, and keep memblock_reserve()'s prototype unmodified. 4) Modify other APIs to support flags, but keep their prototype unmodified. The idea is from Wen Congyang and Liu Jiang . Suggested-by: Wen Congyang Suggested-by: Liu Jiang Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index aab566998b61..270b005ca964 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -255,6 +255,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } @@ -405,7 +406,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) if (this->base + this->size != next->base || memblock_get_region_node(this) != - memblock_get_region_node(next)) { + memblock_get_region_node(next) || + this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; @@ -425,13 +427,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @base: base address of the new region * @size: size of the new region * @nid: node id of the new region + * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, - phys_addr_t size, int nid) + phys_addr_t size, + int nid, unsigned long flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -439,6 +443,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; + rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; @@ -450,6 +455,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base,@base+@size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already @@ -460,7 +466,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) { bool insert = false; phys_addr_t obase = base; @@ -475,6 +482,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; return 0; @@ -505,7 +513,8 @@ repeat: nr_new++; if (insert) memblock_insert_region(type, i++, base, - rbase - base, nid); + rbase - base, nid, + flags); } /* area below @rend is dealt with, forget about it */ base = min(rend, end); @@ -515,7 +524,8 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, nid); + memblock_insert_region(type, i, base, end - base, + nid, flags); } /* @@ -537,12 +547,13 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid); + return memblock_add_region(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size, + MAX_NUMNODES, 0); } /** @@ -597,7 +608,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. Split and redo the @@ -607,7 +619,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) @@ -649,16 +662,24 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - (void *)_RET_IP_); + flags, (void *)_RET_IP_); + + return memblock_add_region(_rgn, base, size, nid, flags); +} - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } /** @@ -1101,6 +1122,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; + unsigned long flags; int i; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); @@ -1111,13 +1133,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); } } -- cgit v1.2.3 From 66b16edf9eafc3291cabb2253d0f342a847656b7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:23 -0800 Subject: memblock, mem_hotplug: introduce MEMBLOCK_HOTPLUG flag to mark hotpluggable regions In find_hotpluggable_memory, once we find out a memory region which is hotpluggable, we want to mark them in memblock.memory. So that we could control memblock allocator not to allocte hotpluggable memory for the kernel later. To achieve this goal, we introduce MEMBLOCK_HOTPLUG flag to indicate the hotpluggable memory regions in memblock and a function memblock_mark_hotplug() to mark hotpluggable memory if we find one. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 270b005ca964..2121ec4c7fa0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -682,6 +682,59 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and mark it with flag + * MEMBLOCK_HOTPLUG. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and clear flag + * MEMBLOCK_HOTPLUG for the isolated regions. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_clear_region_flags(&type->regions[i], + MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable -- cgit v1.2.3 From e7e8de5918dd6a07cbddae559600d7765ad6a56e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:26 -0800 Subject: memblock: make memblock_set_node() support different memblock_type [sfr@canb.auug.org.au: fix powerpc build] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 2121ec4c7fa0..d5681008dce1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -911,18 +911,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for + * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. * Regions which cross the area boundaries are split as necessary. * * RETURNS: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) + struct memblock_type *type, int nid) { - struct memblock_type *type = &memblock.memory; int start_rgn, end_rgn; int i, ret; -- cgit v1.2.3 From 55ac590c2fadad785d60dd70c12d62823bc2cd39 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:35 -0800 Subject: memblock, mem_hotplug: make memblock skip hotpluggable regions if needed Linux kernel cannot migrate pages used by the kernel. As a result, hotpluggable memory used by the kernel won't be able to be hot-removed. To solve this problem, the basic idea is to prevent memblock from allocating hotpluggable memory for the kernel at early time, and arrange all hotpluggable memory in ACPI SRAT(System Resource Affinity Table) as ZONE_MOVABLE when initializing zones. In the previous patches, we have marked hotpluggable memory regions with MEMBLOCK_HOTPLUG flag in memblock.memory. In this patch, we make memblock skip these hotpluggable memory regions in the default top-down allocation function if movable_node boot option is specified. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 12 ++++++++++++ mm/memory_hotplug.c | 1 + 2 files changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index d5681008dce1..6a2a48a122a9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -39,6 +39,9 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; +#ifdef CONFIG_MOVABLE_NODE +bool movable_node_enabled __initdata_memblock = false; +#endif static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; @@ -820,6 +823,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). + * + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't + * be able to hot-remove hotpluggable memory used by the kernel. So this + * function skip hotpluggable regions if needed when allocating memory for the + * kernel. */ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, @@ -844,6 +852,10 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + /* scan areas before each reservation for intersection */ for ( ; ri >= 0; ri--) { struct memblock_region *r = &rsv->regions[ri]; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..01e39afde1cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p) * the kernel away from hotpluggable memory. */ memblock_set_bottom_up(true); + movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); #endif -- cgit v1.2.3 From b2f3eebe7a8ef6cd4e2ea088ac7f613793f6cad6 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:38 -0800 Subject: x86, numa, acpi, memory-hotplug: make movable_node have higher priority If users specify the original movablecore=nn@ss boot option, the kernel will arrange [ss, ss+nn) as ZONE_MOVABLE. The kernelcore=nn@ss boot option is similar except it specifies ZONE_NORMAL ranges. Now, if users specify "movable_node" in kernel commandline, the kernel will arrange hotpluggable memory in SRAT as ZONE_MOVABLE. And if users do this, all the other movablecore=nn@ss and kernelcore=nn@ss options should be ignored. For those who don't want this, just specify nothing. The kernel will act as before. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Reviewed-by: Wanpeng Li Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec4417cb458a..4f59d1986018 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5018,9 +5018,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_type *type = &memblock.memory; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for (i = 0; i < type->cnt; i++) { + if (!memblock_is_hotpluggable(&type->regions[i])) + continue; + + nid = type->regions[i].nid; + + usable_startpfn = PFN_DOWN(type->regions[i].base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } /* - * If movablecore was specified, calculate what size of + * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -5046,7 +5070,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -5137,6 +5160,7 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; +out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = -- cgit v1.2.3 From 1c98dd905ddb7552f13a3f06aa0bd9ef6affeeb7 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 21 Jan 2014 15:49:41 -0800 Subject: memcg: fix kmem_account_flags check in memcg_can_account_kmem() We should start kmem accounting for a memory cgroup only after both its kmem limit is set (KMEM_ACCOUNTED_ACTIVE) and related call sites are patched (KMEM_ACCOUNTED_ACTIVATED). Currently memcg_can_account_kmem() allows kmem accounting even if only one of the conditions is true. Fix it. This means that a page might get charged by memcg_kmem_newpage_charge which would see its static key patched already but memcg_kmem_commit_charge would still see it unpatched and so the charge won't be committed. The result would be charge inconsistency (page_cgroup not marked as PageCgroupUsed) and the charge would leak because __memcg_kmem_uncharge_pages would ignore it. [mhocko@suse.cz: augment changelog] Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f1a356153c0..3065fa80251d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2959,7 +2959,8 @@ static DEFINE_MUTEX(set_limit_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == + KMEM_ACCOUNTED_MASK; } /* -- cgit v1.2.3 From 2753b35bcd3156727138cd2305b825611a571a3f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 21 Jan 2014 15:49:42 -0800 Subject: memcg: make memcg_update_cache_sizes() static This function is not used outside of memcontrol.c so make it static. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3065fa80251d..08541f680d90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3087,7 +3087,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) * But when we create a new cache, we can call this as well if its parent * is kmem-limited. That will have to hold set_limit_mutex as well. */ -int memcg_update_cache_sizes(struct mem_cgroup *memcg) +static int memcg_update_cache_sizes(struct mem_cgroup *memcg) { int num, ret; -- cgit v1.2.3 From b854f711f6b8b49674d494c5e6d706096dd38301 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:43 -0800 Subject: mm/rmap: recompute pgoff for huge page Rmap traversing is used in five different cases, try_to_unmap(), try_to_munlock(), page_referenced(), page_mkclean() and remove_migration_ptes(). Each one implements its own traversing functions for the cases, anon, file, ksm, respectively. These cause lots of duplications and cause maintenance overhead. They also make codes being hard to understand and error-prone. One example is hugepage handling. There is a code to compute hugepage offset correctly in try_to_unmap_file(), but, there isn't a code to compute hugepage offset in rmap_walk_file(). These are used pairwise in migration context, but we missed to modify pairwise. To overcome these drawbacks, we should unify these through one unified function. I decide rmap_walk() as main function since it has no unnecessity. And to control behavior of rmap_walk(), I introduce struct rmap_walk_control having some function pointers. These makes rmap_walk() working for their specific needs. This patchset remove a lot of duplicated code as you can see in below short-stat and kernel text size also decrease slightly. text data bss dec hex filename 10640 1 16 10657 29a1 mm/rmap.o 10047 1 16 10064 2750 mm/rmap.o 13823 705 8288 22816 5920 mm/ksm.o 13199 705 8288 22192 56b0 mm/ksm.o This patch (of 9): We have to recompute pgoff if the given page is huge, since result based on HPAGE_SIZE is not approapriate for scanning the vma interval tree, as shown by commit 36e4f20af833 ("hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach") and commit 369a713e ("rmap: recompute pgoff for unmapping huge page"). To handle both the cases, normal page for page cache and hugetlb page, by same way, we can use compound_page(). It returns 0 on non-compound page and it also returns proper value on compound page. Signed-off-by: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 068522d8502a..edc0aea2c4e3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1512,7 +1512,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; @@ -1520,9 +1520,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - if (PageHuge(page)) - pgoff = page->index << compound_order(page); - mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1712,7 +1709,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; -- cgit v1.2.3 From 0f843c6ac318bb3ea7b63437b66dd39d8f01b088 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:45 -0800 Subject: mm/rmap: factor nonlinear handling out of try_to_unmap_file() To merge all kinds of rmap traverse functions, try_to_unmap(), try_to_munlock(), page_referenced() and page_mkclean(), we need to extract common parts and separate out non-common parts. Nonlinear handling is handled just in try_to_unmap_file() and other rmap traverse functions doesn't care of it. Therfore it is better to factor nonlinear handling out of try_to_unmap_file() in order to merge all kinds of rmap traverse functions easily. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 136 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 74 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index edc0aea2c4e3..7eab4ed304c1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1426,6 +1426,79 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } +static int try_to_unmap_nonlinear(struct page *page, + struct address_space *mapping, struct vm_area_struct *vma) +{ + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + unsigned int mapcount; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + } + + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ + return SWAP_FAIL; + } + + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + mapcount = page_mapcount(page); + if (!mapcount) + return ret; + + cond_resched(); + + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + cursor = (unsigned long) vma->vm_private_data; + while (cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + if (try_to_unmap_cluster(cursor, &mapcount, + vma, page) == SWAP_MLOCK) + ret = SWAP_MLOCK; + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + return ret; + } + vma->vm_private_data = (void *) max_nl_cursor; + } + cond_resched(); + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); + + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) + vma->vm_private_data = NULL; + + return ret; +} + bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1515,10 +1588,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1539,64 +1608,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - ret = SWAP_FAIL; - goto out; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - goto out; - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { - cursor = (unsigned long) vma->vm_private_data; - while ( cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - goto out; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; + ret = try_to_unmap_nonlinear(page, mapping, vma); out: mutex_unlock(&mapping->i_mmap_mutex); return ret; -- cgit v1.2.3 From faecd8dd852d4e4a63a1b8ad43e5df8e41ee0336 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:46 -0800 Subject: mm/rmap: factor lock function out of rmap_walk_anon() When we traverse anon_vma, we need to take a read-side anon_lock. But there is subtle difference in the situation so that we can't use same method to take a lock in each cases. Therefore, we need to make rmap_walk_anon() taking difference lock function. This patch is the first step, factoring lock function for anon_lock out of rmap_walk_anon(). It will be used in case of removing migration entry and in default of rmap_walk_anon(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7eab4ed304c1..5a79bf585e27 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1684,6 +1684,24 @@ void __put_anon_vma(struct anon_vma *anon_vma) } #ifdef CONFIG_MIGRATION +static struct anon_vma *rmap_walk_anon_lock(struct page *page) +{ + struct anon_vma *anon_vma; + + /* + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing + */ + anon_vma = page_anon_vma(page); + if (!anon_vma) + return NULL; + + anon_vma_lock_read(anon_vma); + return anon_vma; +} + /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): * Called by migrate.c to remove migration ptes, but might be used more later. @@ -1696,16 +1714,10 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() - * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem. Users without mmap_sem are required to - * take a reference count to prevent the anon_vma disappearing - */ - anon_vma = page_anon_vma(page); + anon_vma = rmap_walk_anon_lock(page); if (!anon_vma) return ret; - anon_vma_lock_read(anon_vma); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); -- cgit v1.2.3 From 051ac83adf69eea4f57a97356e4282e395a5fa6d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:48 -0800 Subject: mm/rmap: make rmap_walk to get the rmap_walk_control argument In each rmap traverse case, there is some difference so that we need function pointers and arguments to them in order to handle these For this purpose, struct rmap_walk_control is introduced in this patch, and will be extended in following patch. Introducing and extending are separate, because it clarify changes. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 6 +++--- mm/migrate.c | 7 ++++++- mm/rmap.c | 19 ++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 175fff79dc95..c3035fee8080 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1997,8 +1997,7 @@ out: } #ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2033,7 +2032,8 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; - ret = rmap_one(page, vma, rmap_item->address, arg); + ret = rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..11d89dc0574c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -199,7 +199,12 @@ out: */ static void remove_migration_ptes(struct page *old, struct page *new) { - rmap_walk(new, remove_migration_pte, old); + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = old, + }; + + rmap_walk(new, &rwc); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 5a79bf585e27..f8f10ad5d359 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1706,8 +1706,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page) * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): * Called by migrate.c to remove migration ptes, but might be used more later. */ -static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1721,7 +1720,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1729,8 +1728,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, return ret; } -static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << compound_order(page); @@ -1742,7 +1740,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1755,17 +1753,16 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, return ret; } -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { VM_BUG_ON(!PageLocked(page)); if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rmap_one, arg); + return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rmap_one, arg); + return rmap_walk_anon(page, rwc); else - return rmap_walk_file(page, rmap_one, arg); + return rmap_walk_file(page, rwc); } #endif /* CONFIG_MIGRATION */ -- cgit v1.2.3 From 0dd1c7bbce8d1d142bb25aefaa50262dfd77cb78 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:49 -0800 Subject: mm/rmap: extend rmap_walk_xxx() to cope with different cases There are a lot of common parts in traversing functions, but there are also a little of uncommon parts in it. By assigning proper function pointer on each rmap_walker_control, we can handle these difference correctly. Following are differences we should handle. 1. difference of lock function in anon mapping case 2. nonlinear handling in file mapping case 3. prechecked condition: checking memcg in page_referenced(), checking VM_SHARE in page_mkclean() checking temporary vma in try_to_unmap() 4. exit condition: checking page_mapped() in try_to_unmap() So, in this patch, I introduce 4 function pointers to handle above differences. Signed-off-by: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 +++++++ mm/rmap.c | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index c3035fee8080..91b8cb35f7cc 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2032,12 +2032,19 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; } + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); + goto out; + } } anon_vma_unlock_read(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index f8f10ad5d359..97bf8f0396f8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1684,10 +1684,14 @@ void __put_anon_vma(struct anon_vma *anon_vma) } #ifdef CONFIG_MIGRATION -static struct anon_vma *rmap_walk_anon_lock(struct page *page) +static struct anon_vma *rmap_walk_anon_lock(struct page *page, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; + if (rwc->anon_lock) + return rwc->anon_lock(page); + /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages @@ -1713,16 +1717,22 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page); + anon_vma = rmap_walk_anon_lock(page, rwc); if (!anon_vma) return ret; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; + if (rwc->done && rwc->done(page)) + break; } anon_vma_unlock_read(anon_vma); return ret; @@ -1740,15 +1750,26 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) - break; + goto done; + if (rwc->done && rwc->done(page)) + goto done; } - /* - * No nonlinear handling: being always shared, nonlinear vmas - * never contain migration ptes. Decide what to do about this - * limitation to linear when we need rmap_walk() on nonlinear. - */ + + if (!rwc->file_nonlinear) + goto done; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto done; + + ret = rwc->file_nonlinear(page, mapping, vma); + +done: mutex_unlock(&mapping->i_mmap_mutex); return ret; } -- cgit v1.2.3 From 52629506420ce32997f1fba0a1ab2f1aaa9a4f79 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:50 -0800 Subject: mm/rmap: use rmap_walk() in try_to_unmap() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_unmap(). In this patch, I change following things. 1. enable rmap_walk() if !CONFIG_MIGRATION. 2. mechanical change to use rmap_walk() in try_to_unmap(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- mm/rmap.c | 48 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 91b8cb35f7cc..6b4baa97f4c0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1982,7 +1982,7 @@ again: continue; ret = try_to_unmap_one(page, vma, - rmap_item->address, flags); + rmap_item->address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { anon_vma_unlock_read(anon_vma); goto out; @@ -1996,7 +1996,6 @@ out: return ret; } -#ifdef CONFIG_MIGRATION int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -2054,6 +2053,7 @@ out: return ret; } +#ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index 97bf8f0396f8..b3263cb32361 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1179,15 +1179,18 @@ out: /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. + * + * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, enum ttu_flags flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1513,6 +1516,11 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) return false; } +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1558,7 +1566,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; } @@ -1592,7 +1600,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; } @@ -1614,6 +1622,11 @@ out: return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped @@ -1631,16 +1644,29 @@ out: int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .file_nonlinear = try_to_unmap_nonlinear, + .anon_lock = page_lock_anon_vma_read, + }; - BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); - if (unlikely(PageKsm(page))) - ret = try_to_unmap_ksm(page, flags); - else if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); - else - ret = try_to_unmap_file(page, flags); + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + rwc.invalid_vma = invalid_migration_vma; + + ret = rmap_walk(page, &rwc); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1683,7 +1709,6 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(anon_vma); } -#ifdef CONFIG_MIGRATION static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct rmap_walk_control *rwc) { @@ -1785,7 +1810,6 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) else return rmap_walk_file(page, rwc); } -#endif /* CONFIG_MIGRATION */ #ifdef CONFIG_HUGETLB_PAGE /* -- cgit v1.2.3 From e8351ac9bfa7f4412d5d196b6742309473ca506d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:52 -0800 Subject: mm/rmap: use rmap_walk() in try_to_munlock() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_munlock(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> try_to_unmap_ksm, try_to_unmap_anon, try_to_unmap_file 2. mechanical change to use rmap_walk() in try_to_munlock(). 3. copy and paste comments. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 50 -------------------- mm/rmap.c | 154 +++++++++++++++++--------------------------------------------- 2 files changed, 42 insertions(+), 162 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 6b4baa97f4c0..646d45a6b6c8 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1946,56 +1946,6 @@ out: return referenced; } -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return SWAP_FAIL; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - ret = try_to_unmap_one(page, vma, - rmap_item->address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock_read(anon_vma); - goto out; - } - } - anon_vma_unlock_read(anon_vma); - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index b3263cb32361..c73e0c645d09 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1177,9 +1177,6 @@ out: } /* - * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. - * * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, @@ -1521,107 +1518,6 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -/** - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based - * rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * anonymous pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return ret; - - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address; - - /* - * During exec, a temporary VMA is setup and later moved. - * The VMA is moved under the anon_vma lock but not the - * page tables leading to a race where migration cannot - * find the migration ptes. Rather than increasing the - * locking requirements of exec(), migration skips - * temporary VMAs until after exec() completes. - */ - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_vma_temporary_stack(vma)) - continue; - - address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - break; - } - - page_unlock_anon_vma_read(anon_vma); - return ret; -} - -/** - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the address_space struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * object-based pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) -{ - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; - } - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto out; - - /* - * We don't bother to try to find the munlocked page in nonlinears. - * It's costly. Instead, later, page reclaim logic may call - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. - */ - if (TTU_ACTION(flags) == TTU_MUNLOCK) - goto out; - - ret = try_to_unmap_nonlinear(page, mapping, vma); -out: - mutex_unlock(&mapping->i_mmap_mutex); - return ret; -} - static int page_not_mapped(struct page *page) { return !page_mapped(page); @@ -1689,14 +1585,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) */ int try_to_munlock(struct page *page) { + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)TTU_MUNLOCK, + .done = page_not_mapped, + /* + * We don't bother to try to find the munlocked page in + * nonlinears. It's costly. Instead, later, page reclaim logic + * may call try_to_unmap() and recover PG_mlocked lazily. + */ + .file_nonlinear = NULL, + .anon_lock = page_lock_anon_vma_read, + + }; + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (unlikely(PageKsm(page))) - return try_to_unmap_ksm(page, TTU_MUNLOCK); - else if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); - else - return try_to_unmap_file(page, TTU_MUNLOCK); + ret = rmap_walk(page, &rwc); + return ret; } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1732,8 +1639,18 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, } /* - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): - * Called by migrate.c to remove migration ptes, but might be used more later. + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. */ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { @@ -1763,6 +1680,19 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) return ret; } +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. + */ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; -- cgit v1.2.3 From 9f32624be943538983eb0f18b73a9052d1493c80 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:53 -0800 Subject: mm/rmap: use rmap_walk() in page_referenced() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in page_referenced(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> page_referenced_ksm, page_referenced_anon, page_referenced_file 2. introduce new struct page_referenced_arg and pass it to page_referenced_one(), main function of rmap_walk, in order to count reference, to store vm_flags and to check finish condition. 3. mechanical change to use rmap_walk() in page_referenced(). [liwanp@linux.vnet.ibm.com: fix BUG at rmap_walk] Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Wanpeng Li Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 60 ++---------------- mm/rmap.c | 210 ++++++++++++++++++++++---------------------------------------- 2 files changed, 79 insertions(+), 191 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 646d45a6b6c8..3df141e5f3e0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1891,61 +1891,6 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - unsigned int mapcount = page_mapcount(page); - int referenced = 0; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return 0; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - - referenced += page_referenced_one(page, vma, - rmap_item->address, &mapcount, vm_flags); - if (!search_new_forks || !mapcount) - break; - } - anon_vma_unlock_read(anon_vma); - if (!mapcount) - goto out; - } - if (!search_new_forks++) - goto again; -out: - return referenced; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -1954,6 +1899,11 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); + + /* + * Rely on the page lock to protect against concurrent modifications + * to that page's node of the stable tree. + */ VM_BUG_ON(!PageLocked(page)); stable_node = page_stable_node(page); diff --git a/mm/rmap.c b/mm/rmap.c index c73e0c645d09..080413036406 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +struct page_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; /* - * Subfunctions of page_referenced: page_referenced_one called - * repeatedly from either page_referenced_anon or page_referenced_file. + * arg: page_referenced_arg will be passed */ int page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, unsigned int *mapcount, - unsigned long *vm_flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; @@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); if (!pmd) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { spin_unlock(ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } /* go ahead even if the pmd is pmd_trans_splitting() */ @@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - (*mapcount)--; - - if (referenced) - *vm_flags |= vma->vm_flags; -out: - return referenced; -} - -static int page_referenced_anon(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - unsigned int mapcount; - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int referenced = 0; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return referenced; - - mapcount = page_mapcount(page); - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags; } - page_unlock_anon_vma_read(anon_vma); - return referenced; + pra->mapcount--; + if (!pra->mapcount) + return SWAP_SUCCESS; /* To break the loop */ + + return SWAP_AGAIN; } -/** - * page_referenced_file - referenced check for object-based rmap - * @page: the page we're checking references on. - * @memcg: target memory control group - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page - * - * For an object-based mapped page, find all the places it is mapped and - * check/clear the referenced flag. This is done by following the page->mapping - * pointer, then walking the chain of vmas it holds. It returns the number - * of references it found. - * - * This function is only called from page_referenced for object-based pages. - */ -static int page_referenced_file(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) { - unsigned int mapcount; - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int referenced = 0; - - /* - * The caller's checks on page->mapping and !PageAnon have made - * sure that this is a file page: the check for page->mapping - * excludes the case just before it gets set on an anon page. - */ - BUG_ON(PageAnon(page)); + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. - */ - BUG_ON(!PageLocked(page)); - - mutex_lock(&mapping->i_mmap_mutex); - - /* - * i_mmap_mutex does not stabilize mapcount at all, but mapcount - * is more likely to be accurate if we note it after spinning. - */ - mapcount = page_mapcount(page); - - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; - mutex_unlock(&mapping->i_mmap_mutex); - return referenced; + return false; } /** @@ -851,32 +768,47 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int referenced = 0; + int ret; int we_locked = 0; + struct page_referenced_arg pra = { + .mapcount = page_mapcount(page), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, + }; *vm_flags = 0; - if (page_mapped(page) && page_rmapping(page)) { - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); - if (!we_locked) { - referenced++; - goto out; - } - } - if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, memcg, - vm_flags); - else if (PageAnon(page)) - referenced += page_referenced_anon(page, memcg, - vm_flags); - else if (page->mapping) - referenced += page_referenced_file(page, memcg, - vm_flags); - if (we_locked) - unlock_page(page); + if (!page_mapped(page)) + return 0; + + if (!page_rmapping(page)) + return 0; + + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) + return 1; } -out: - return referenced; + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_page_referenced_vma; + } + + ret = rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + unlock_page(page); + + return pra.referenced; } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, @@ -1700,6 +1632,14 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma; int ret = SWAP_AGAIN; + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_mutex. + */ + VM_BUG_ON(!PageLocked(page)); + if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); @@ -1731,8 +1671,6 @@ done: int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { - VM_BUG_ON(!PageLocked(page)); - if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) -- cgit v1.2.3 From 9853a407b97d8d066b5a865173a4859a3e69fd8a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:55 -0800 Subject: mm/rmap: use rmap_walk() in page_mkclean() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in page_mkclean(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> page_mkclean_file 2. mechanical change to use rmap_walk() in page_mkclean(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 080413036406..962e2a1e13a0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -812,12 +812,13 @@ int page_referenced(struct page *page, } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; spinlock_t *ptl; int ret = 0; + int *cleaned = arg; pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) @@ -836,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { mmu_notifier_invalidate_page(mm, address); + (*cleaned)++; + } out: - return ret; + return SWAP_AGAIN; } -static int page_mkclean_file(struct address_space *mapping, struct page *page) +static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int ret = 0; - - BUG_ON(PageAnon(page)); + if (vma->vm_flags & VM_SHARED) + return 0; - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) { - unsigned long address = vma_address(page, vma); - ret += page_mkclean_one(page, vma, address); - } - } - mutex_unlock(&mapping->i_mmap_mutex); - return ret; + return 1; } int page_mkclean(struct page *page) { - int ret = 0; + int cleaned = 0; + struct address_space *mapping; + struct rmap_walk_control rwc = { + .arg = (void *)&cleaned, + .rmap_one = page_mkclean_one, + .invalid_vma = invalid_mkclean_vma, + }; BUG_ON(!PageLocked(page)); - if (page_mapped(page)) { - struct address_space *mapping = page_mapping(page); - if (mapping) - ret = page_mkclean_file(mapping, page); - } + if (!page_mapped(page)) + return 0; - return ret; + mapping = page_mapping(page); + if (!mapping) + return 0; + + rmap_walk(page, &rwc); + + return cleaned; } EXPORT_SYMBOL_GPL(page_mkclean); -- cgit v1.2.3 From 1da4db0cd5c8a31d4468ec906b413e75e604b465 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:58 -0800 Subject: oom_kill: change oom_kill.c to use for_each_thread() Change oom_kill.c to use for_each_thread() rather than the racy while_each_thread() which can loop forever if we race with exit. Note also that most users were buggy even if while_each_thread() was fine, the task can exit even _before_ rcu_read_lock(). Fortunately the new for_each_thread() only requires the stable task_struct, so this change fixes both problems. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..96d7945f75a6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -59,7 +59,7 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, { struct task_struct *start = tsk; - do { + for_each_thread(start, tsk) { if (mask) { /* * If this is a mempolicy constrained oom, tsk's @@ -77,7 +77,7 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, if (cpuset_mems_allowed_intersects(current, tsk)) return true; } - } while_each_thread(start, tsk); + } return false; } @@ -97,14 +97,14 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, */ struct task_struct *find_lock_task_mm(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; - do { + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) return t; task_unlock(t); - } while_each_thread(p, t); + } return NULL; } @@ -301,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long chosen_points = 0; rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { unsigned int points; switch (oom_scan_process_thread(p, totalpages, nodemask, @@ -323,7 +323,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; chosen_points = points; } - } while_each_thread(g, p); + } if (chosen) get_task_struct(chosen); rcu_read_unlock(); @@ -406,7 +406,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t = p; + struct task_struct *t; struct mm_struct *mm; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -437,7 +437,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * still freeing memory. */ read_lock(&tasklist_lock); - do { + for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -455,7 +455,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, get_task_struct(victim); } } - } while_each_thread(p, t); + } read_unlock(&tasklist_lock); rcu_read_lock(); -- cgit v1.2.3 From ad96244179fbd55b40c00f10f399bc04739b8e1f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:50:00 -0800 Subject: oom_kill: has_intersects_mems_allowed() needs rcu_read_lock() At least out_of_memory() calls has_intersects_mems_allowed() without even rcu_read_lock(), this is obviously buggy. Add the necessary rcu_read_lock(). This means that we can not simply return from the loop, we need "bool ret" and "break". While at it, swap the names of task_struct's (the argument and the local). This cleans up the code a little bit and avoids the unnecessary initialization. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 96d7945f75a6..0d8ad1ebd1d1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -47,18 +47,20 @@ static DEFINE_SPINLOCK(zone_scan_lock); #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. */ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start, const nodemask_t *mask) { - struct task_struct *start = tsk; + struct task_struct *tsk; + bool ret = false; + rcu_read_lock(); for_each_thread(start, tsk) { if (mask) { /* @@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, * mempolicy intersects current, otherwise it may be * needlessly killed. */ - if (mempolicy_nodemask_intersects(tsk, mask)) - return true; + ret = mempolicy_nodemask_intersects(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only * check the mems of tsk's cpuset. */ - if (cpuset_mems_allowed_intersects(current, tsk)) - return true; + ret = cpuset_mems_allowed_intersects(current, tsk); } + if (ret) + break; } + rcu_read_unlock(); - return false; + return ret; } #else static bool has_intersects_mems_allowed(struct task_struct *tsk, -- cgit v1.2.3 From 4d4048be8a93769350efa31d2482a038b7de73d0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:50:01 -0800 Subject: oom_kill: add rcu_read_lock() into find_lock_task_mm() find_lock_task_mm() expects it is called under rcu or tasklist lock, but it seems that at least oom_unkillable_task()->task_in_mem_cgroup() and mem_cgroup_out_of_memory()->oom_badness() can call it lockless. Perhaps we could fix the callers, but this patch simply adds rcu lock into find_lock_task_mm(). This also allows to simplify a bit one of its callers, oom_kill_process(). Signed-off-by: Oleg Nesterov Cc: Sergey Dyasly Cc: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0d8ad1ebd1d1..054ff47c4478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -102,14 +102,19 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t; + rcu_read_lock(); + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - return t; + goto found; task_unlock(t); } + t = NULL; +found: + rcu_read_unlock(); - return NULL; + return t; } /* return true if the task is not adequate as candidate victim task. */ @@ -461,10 +466,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } read_unlock(&tasklist_lock); - rcu_read_lock(); p = find_lock_task_mm(victim); if (!p) { - rcu_read_unlock(); put_task_struct(victim); return; } else if (victim != p) { @@ -490,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * That thread will now get access to memory reserves since it has a * pending fatal signal. */ + rcu_read_lock(); for_each_process(p) if (p->mm == mm && !same_thread_group(p, victim) && !(p->flags & PF_KTHREAD)) { -- cgit v1.2.3 From fd615c4e671979e3e362df537d6be38f8d27aa80 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:05 -0800 Subject: mm/memblock: debug: don't free reserved array if !ARCH_DISCARD_MEMBLOCK Now the Nobootmem allocator will always try to free memory allocated for reserved memory regions (free_low_memory_core_early()) without taking into to account current memblock debugging configuration (CONFIG_ARCH_DISCARD_MEMBLOCK and CONFIG_DEBUG_FS state). As result if: - CONFIG_DEBUG_FS defined - CONFIG_ARCH_DISCARD_MEMBLOCK not defined; - reserved memory regions array have been resized during boot then: - memory allocated for reserved memory regions array will be freed to buddy allocator; - debug_fs entry "sys/kernel/debug/memblock/reserved" will show garbage instead of state of memory reservations. like: 0: 0x98393bc0..0x9a393bbf 1: 0xff120000..0xff11ffff 2: 0x00000000..0xffffffff Hence, do not free memory allocated for reserved memory regions if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK). Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Reviewed-by: Tejun Heo Cc: Yinghai Lu Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 6a2a48a122a9..de4d9c352fd6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -269,6 +269,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( if (memblock.reserved.regions == memblock_reserved_init_regions) return 0; + /* + * Don't allow nobootmem allocator to free reserved memory regions + * array if + * - CONFIG_DEBUG_FS is enabled; + * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; + * - reserved memory regions array have been resized during boot. + * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" + * will show garbage instead of state of memory reservations. + */ + if (IS_ENABLED(CONFIG_DEBUG_FS) && + !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + return 0; + *addr = __pa(memblock.reserved.regions); return PAGE_ALIGN(sizeof(struct memblock_region) * -- cgit v1.2.3 From 869a84e1ca163b737236dae997db4a6a1e230b9b Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:10 -0800 Subject: mm/memblock: remove unnecessary inclusions of bootmem.h Clean-up to remove depedency with bootmem headers. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Reviewed-by: Tejun Heo Cc: Yinghai Lu Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Christoph Lameter Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01e39afde1cb..af4935ee444f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 79f40fab0b3a78e0e41fac79a65a9870f4b05652 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:12 -0800 Subject: mm/memblock: drop WARN and use SMP_CACHE_BYTES as a default alignment Don't produce warning and interpret 0 as "default align" equal to SMP_CACHE_BYTES in case if caller of memblock_alloc_base_nid() doesn't specify alignment for the block (align == 0). This is done in preparation of introducing common memblock alloc interface to make code behavior consistent. More details are in below thread : https://lkml.org/lkml/2013/10/13/117. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index de4d9c352fd6..6aca54812db0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -969,8 +969,8 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; - if (WARN_ON(!align)) - align = __alignof__(long long); + if (!align) + align = SMP_CACHE_BYTES; /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); -- cgit v1.2.3 From 87029ee9390b2297dae699d5fb135b77992116e5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:14 -0800 Subject: mm/memblock: reorder parameters of memblock_find_in_range_node Reorder parameters of memblock_find_in_range_node to be consistent with other memblock APIs. The change was suggested by Tejun Heo . Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 16 ++++++++-------- mm/nobootmem.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 6aca54812db0..a95d6dc066d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -157,10 +157,10 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, /** * memblock_find_in_range_node - find free area in given range and node - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %MAX_NUMNODES for any node * * Find @size free area aligned to @align in the specified range and node. @@ -176,9 +176,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * RETURNS: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { int ret; phys_addr_t kernel_end; @@ -241,8 +241,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); + return memblock_find_in_range_node(size, align, start, end, + MAX_NUMNODES); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -975,7 +975,7 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); - found = memblock_find_in_range_node(0, max_addr, size, align, nid); + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..59777e050d09 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid); if (!addr) return NULL; -- cgit v1.2.3 From b115423357e0cda6d8f45d0c81df537d7b004020 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:16 -0800 Subject: mm/memblock: switch to use NUMA_NO_NODE instead of MAX_NUMNODES It's recommended to use NUMA_NO_NODE everywhere to select "process any node" behavior or to indicate that "no node id specified". Hence, update __next_free_mem_range*() API's to accept both NUMA_NO_NODE and MAX_NUMNODES, but emit warning once on MAX_NUMNODES, and correct corresponding API's documentation to describe new behavior. Also, update other memblock/nobootmem APIs where MAX_NUMNODES is used dirrectly. The change was suggested by Tejun Heo. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 28 +++++++++++++++++++--------- mm/nobootmem.c | 8 ++++---- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index a95d6dc066d5..03f1dc7b663c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * @@ -126,7 +126,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -161,7 +161,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @align: alignment of free area to find * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Find @size free area aligned to @align in the specified range and node. * @@ -242,7 +242,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { return memblock_find_in_range_node(size, align, start, end, - MAX_NUMNODES); + NUMA_NO_NODE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -754,7 +754,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -782,6 +782,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + __func__); for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; @@ -789,7 +794,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -830,7 +835,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, /** * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -850,6 +855,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + __func__); if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; @@ -862,7 +872,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* skip hotpluggable memory regions if needed */ @@ -989,7 +999,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 59777e050d09..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end, size; u64 i; - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); /* free range that is used for reserved array if we allocate it */ @@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ @@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -299,7 +299,7 @@ again: if (ptr) return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; -- cgit v1.2.3 From 26f09e9b3a0696f6fe20b021901300fba26fb579 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:19 -0800 Subject: mm/memblock: add memblock memory allocation apis Introduce memblock memory allocation APIs which allow to support PAE or LPAE extension on 32 bits archs where the physical memory start address can be beyond 4GB. In such cases, existing bootmem APIs which operate on 32 bit addresses won't work and needs memblock layer which operates on 64 bit addresses. So we add equivalent APIs so that we can replace usage of bootmem with memblock interfaces. Architectures already converted to NO_BOOTMEM use these new memblock interfaces. The architectures which are still not converted to NO_BOOTMEM continue to function as is because we still maintain the fal lback option of bootmem back-end supporting these new interfaces. So no functional change as such. In long run, once all the architectures moves to NO_BOOTMEM, we can get rid of bootmem layer completely. This is one step to remove the core code dependency with bootmem and also gives path for architectures to move away from bootmem. The proposed interface will became active if both CONFIG_HAVE_MEMBLOCK and CONFIG_NO_BOOTMEM are specified by arch. In case !CONFIG_NO_BOOTMEM, the memblock() wrappers will fallback to the existing bootmem apis so that arch's not converted to NO_BOOTMEM continue to work as is. The meaning of MEMBLOCK_ALLOC_ACCESSIBLE and MEMBLOCK_ALLOC_ANYWHERE is kept same. [akpm@linux-foundation.org: s/depricated/deprecated/] Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 207 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 03f1dc7b663c..018e55dc004d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -21,6 +21,9 @@ #include #include +#include + +#include "internal.h" static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -785,7 +788,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", __func__); for ( ; mi < mem->cnt; mi++) { @@ -858,7 +861,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", __func__); if (*idx == (u64)ULLONG_MAX) { @@ -1029,6 +1032,208 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. + * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (nid == MAX_NUMNODES) + pr_warn("%s: usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE\n", + __func__); + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + __memblock_remove(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone. + */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} /* * Remaining API functions -- cgit v1.2.3 From 6782832eba5e8c87a749a41da8deda1c3ef67ba0 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:25 -0800 Subject: mm/page_alloc.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59d1986018..b230e838883d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4215,7 +4215,6 @@ static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; - struct pglist_data *pgdat = zone->zone_pgdat; size_t alloc_size; /* @@ -4231,7 +4230,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node_nopanic(pgdat, alloc_size); + memblock_virt_alloc_node_nopanic( + alloc_size, zone->zone_pgdat->node_id); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -4351,13 +4351,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) #endif /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling free_bootmem() manually. + * this function may be used instead of calling memblock_free_early_nid() + * manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4369,9 +4370,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) end_pfn = min(end_pfn, max_low_pfn); if (start_pfn < end_pfn) - free_bootmem_node(NODE_DATA(this_nid), - PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT); + memblock_free_early_nid(PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT, + this_nid); } } @@ -4642,8 +4643,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, - usemapsize); + zone->pageblock_flags = + memblock_virt_alloc_node_nopanic(usemapsize, + pgdat->node_id); } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -4837,7 +4839,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node_nopanic(pgdat, size); + map = memblock_virt_alloc_node_nopanic(size, + pgdat->node_id); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -5887,7 +5890,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem_nopanic(size); + table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { -- cgit v1.2.3 From bb016b84164554725899aef544331085e08cb402 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:34 -0800 Subject: mm/sparse: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 6 ++++-- mm/sparse.c | 27 +++++++++++++++------------ 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); + return memblock_virt_alloc_try_nid(size, align, goal, + BOOTMEM_ALLOC_ACCESSIBLE, node); } static void *vmemmap_buf; @@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ - free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); + memblock_free_early(__pa(vmemmap_buf), + vmemmap_buf_end - vmemmap_buf); vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) else section = kzalloc(array_size, GFP_KERNEL); } else { - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + section = memblock_virt_alloc_node(array_size, nid); } return section; @@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, - SMP_CACHE_BYTES, goal, limit); + p = memblock_virt_alloc_try_nid_nopanic(size, + SMP_CACHE_BYTES, goal, limit, + nid); if (!p && limit) { limit = 0; goto again; @@ -331,7 +332,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return alloc_bootmem_node_nopanic(pgdat, size); + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) return map; size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); - map = __alloc_bootmem_node_high(NODE_DATA(nid), size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) @@ -545,7 +548,7 @@ void __init sparse_init(void) * sparse_early_mem_map_alloc, so allocate usemap_map at first. */ size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); + usemap_map = memblock_virt_alloc(size, 0); if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, @@ -553,7 +556,7 @@ void __init sparse_init(void) #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); + map_map = memblock_virt_alloc(size2, 0); if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, @@ -583,9 +586,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - free_bootmem(__pa(map_map), size2); + memblock_free_early(__pa(map_map), size2); #endif - free_bootmem(__pa(usemap_map), size); + memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From 8b89a1169437541a2a9b62c8f7b1a5c0ceb0fbde Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:36 -0800 Subject: mm/hugetlb.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1697ff0cc53a..04306b9de90d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1271,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), - huge_page_size(h), huge_page_size(h), 0); - + addr = memblock_virt_alloc_try_nid_nopanic( + huge_page_size(h), huge_page_size(h), + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -1313,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) #ifdef CONFIG_HIGHMEM page = pfn_to_page(m->phys >> PAGE_SHIFT); - free_bootmem_late((unsigned long)m, - sizeof(struct huge_bootmem_page)); + memblock_free_late(__pa(m), + sizeof(struct huge_bootmem_page)); #else page = virt_to_page(m); #endif -- cgit v1.2.3 From 0d036e9e33df8befa9348683ba68258fee7f0a00 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:38 -0800 Subject: mm/page_cgroup.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a872a..d8bd2c500aa4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) table_size = sizeof(struct page_cgroup) * nr_pages; - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; -- cgit v1.2.3 From 999c17e3de4855af4e829c0871ad32fc76a93991 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:40 -0800 Subject: mm/percpu.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 0d10defe951e..65fd8a749712 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); if (!ptr) return NULL; ai = ptr; @@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - free_bootmem(__pa(ai), ai->__ai_size); + memblock_free_early(__pa(ai), ai->__ai_size); } /** @@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); - group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); - unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + group_offsets = memblock_virt_alloc(ai->nr_groups * + sizeof(group_offsets[0]), 0); + group_sizes = memblock_virt_alloc(ai->nr_groups * + sizeof(group_sizes[0]), 0); + unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); + unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + pcpu_slot = memblock_virt_alloc( + pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * covers static area + reserved area (mostly used for module * static percpu allocation). */ - schunk = alloc_bootmem(pcpu_chunk_struct_size); + schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); schunk->base_addr = base_addr; schunk->map = smap; @@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(pcpu_chunk_struct_size); + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); dchunk->base_addr = base_addr; dchunk->map = dmap; @@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = alloc_bootmem_nopanic(areas_size); + areas = memblock_virt_alloc_nopanic(areas_size, 0); if (!areas) { rc = -ENOMEM; goto out_free; @@ -1712,7 +1715,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - free_bootmem(__pa(areas), areas_size); + memblock_free_early(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = alloc_bootmem(pages_size); + pages = memblock_virt_alloc(pages_size, 0); /* allocate pages */ j = 0; @@ -1823,7 +1826,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - free_bootmem(__pa(pages), pages_size); + memblock_free_early(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_virt_alloc_from_nopanic( + size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - free_bootmem(__pa(ptr), size); + memblock_free_early(__pa(ptr), size); } void __init setup_per_cpu_areas(void) @@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + fc = memblock_virt_alloc_from_nopanic(unit_size, + PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ -- cgit v1.2.3 From 9e43aa2b8d1cb3137bd7e60d5fead83d0569de2b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:43 -0800 Subject: mm/memory_hotplug.c: use memblock apis for early memory allocations Correct ensure_zone_is_initialized() function description according to the introduced memblock APIs for early memory allocations. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index af4935ee444f..cc2ab37220b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, } /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic() */ + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { -- cgit v1.2.3 From 560dca27a6b36015e4f69a4ceba0ee5be0707c17 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:55 -0800 Subject: mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter Check nid parameter and produce warning if it has deprecated MAX_NUMNODES value. Also re-assign NUMA_NO_NODE value to the nid parameter in this case. These will help to identify the wrong API usage (the caller) and make code simpler. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 018e55dc004d..1c2ef2c7edab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -785,11 +785,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; - bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); - if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; @@ -797,7 +795,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (check_node && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -858,11 +856,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; - bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); - if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; @@ -875,7 +871,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (check_node && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* skip hotpluggable memory regions if needed */ @@ -1067,9 +1063,8 @@ static void * __init memblock_virt_alloc_internal( phys_addr_t alloc; void *ptr; - if (nid == MAX_NUMNODES) - pr_warn("%s: usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; /* * Detect any accidental use of these APIs after slab is ready, as at -- cgit v1.2.3 From 4883e997b26ed857da8dae6a6e6aeb12830b978d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 Jan 2014 15:50:56 -0800 Subject: mm/hwpoison: add '#' to hwpoison_inject Add '#' to hwpoison_inject just as done in madvise_hwpoison. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Reviewed-by: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4c84678371eb..95487c71cad5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; inject: - printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); } -- cgit v1.2.3 From 1c30e0177e4f41a11cb88b0f1f056ccebfe0fff4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:50:58 -0800 Subject: mm: numa: make NUMA-migrate related functions static numamigrate_update_ratelimit and numamigrate_isolate_page only have callers in mm/migrate.c. This patch makes them static. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 11d89dc0574c..41eba21f10ba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1599,7 +1599,8 @@ bool migrate_ratelimited(int node) } /* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +static bool numamigrate_update_ratelimit(pg_data_t *pgdat, + unsigned long nr_pages) { bool rate_limited = false; @@ -1623,7 +1624,7 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) return rate_limited; } -int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; -- cgit v1.2.3 From 1c5e9c27cbd966c7f0038698d5dcd5ada3574f47 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:50:59 -0800 Subject: mm: numa: limit scope of lock for NUMA migrate rate limiting NUMA migrate rate limiting protects a migration counter and window using a lock but in some cases this can be a contended lock. It is not critical that the number of pages be perfect, lost updates are acceptable. Reduce the importance of this lock. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 41eba21f10ba..4612bb2e3677 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1602,26 +1602,29 @@ bool migrate_ratelimited(int node) static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) { - bool rate_limited = false; - /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - spin_lock(&pgdat->numabalancing_migrate_lock); if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + spin_lock(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); + spin_unlock(&pgdat->numabalancing_migrate_lock); } if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; + return true; + + /* + * This is an unlocked non-atomic update so errors are possible. + * The consequences are failing to migrate when we potentiall should + * have which is not severe enough to warrant locking. If it is ever + * a problem, it can be converted to a per-cpu counter. + */ + pgdat->numabalancing_migrate_nr_pages += nr_pages; + return false; } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) -- cgit v1.2.3 From af1839d722c986ffeaae1e70a6ef1c75ff38dcd5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:01 -0800 Subject: mm: numa: trace tasks that fail migration due to rate limiting A low local/remote numa hinting fault ratio is potentially explained by failed migrations. This patch adds a tracepoint that fires when migration fails due to migration rate limitation. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 4612bb2e3677..f9e16350d09c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1614,8 +1614,11 @@ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, msecs_to_jiffies(migrate_interval_millisecs); spin_unlock(&pgdat->numabalancing_migrate_lock); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { + trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, + nr_pages); return true; + } /* * This is an unlocked non-atomic update so errors are possible. -- cgit v1.2.3 From 64a9a34e22896dad430e21a28ad8cb00a756fefc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:02 -0800 Subject: mm: numa: do not automatically migrate KSM pages KSM pages can be shared between tasks that are not necessarily related to each other from a NUMA perspective. This patch causes those pages to be ignored by automatic NUMA balancing so they do not migrate and do not cause unrelated tasks to be grouped together. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = *pte; page = vm_normal_page(vma, addr, oldpte); - if (page) { + if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); set_pte_at(mm, addr, pte, ptent); -- cgit v1.2.3 From 947b3dd1a84ba3fcb7163688fdc36671941786f4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Jan 2014 15:51:04 -0800 Subject: memcg, oom: lock mem_cgroup_print_oom_info mem_cgroup_print_oom_info uses a static buffer (memcg_name) to store the name of the cgroup. This is not safe as pointed out by David Rientjes because memcg oom is locked only for its hierarchy and nothing prevents another parallel hierarchy to trigger oom as well and overwrite the already in-use buffer. This patch introduces oom_info_lock hidden inside mem_cgroup_print_oom_info which is held throughout the function. It makes access to memcg_name safe and as a bonus it also prevents parallel memcg ooms to interleave their statistics which would make the printed data hard to analyze otherwise. Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08541f680d90..57b16083f046 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1647,13 +1647,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; /* - * Need a buffer in BSS, can't rely on allocations. The code relies - * on the assumption that OOM is serialized for memory controller. - * If this assumption is broken, revisit this code. + * protects memcg_name and makes sure that parallel ooms do not + * interleave */ + static DEFINE_SPINLOCK(oom_info_lock); + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; int ret; struct mem_cgroup *iter; @@ -1662,6 +1662,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; + spin_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1730,6 +1731,7 @@ done: pr_cont("\n"); } + spin_unlock(&oom_info_lock); } /* -- cgit v1.2.3 From 0eb927c0ab789d3d7d69f68acb850f69d4e7c36f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:05 -0800 Subject: mm: compaction: trace compaction begin and end The broad goal of the series is to improve allocation success rates for huge pages through memory compaction, while trying not to increase the compaction overhead. The original objective was to reintroduce capturing of high-order pages freed by the compaction, before they are split by concurrent activity. However, several bugs and opportunities for simple improvements were found in the current implementation, mostly through extra tracepoints (which are however too ugly for now to be considered for sending). The patches mostly deal with two mechanisms that reduce compaction overhead, which is caching the progress of migrate and free scanners, and marking pageblocks where isolation failed to be skipped during further scans. Patch 1 (from mgorman) adds tracepoints that allow calculate time spent in compaction and potentially debug scanner pfn values. Patch 2 encapsulates the some functionality for handling deferred compactions for better maintainability, without a functional change type is not determined without being actually needed. Patch 3 fixes a bug where cached scanner pfn's are sometimes reset only after they have been read to initialize a compaction run. Patch 4 fixes a bug where scanners meeting is sometimes not properly detected and can lead to multiple compaction attempts quitting early without doing any work. Patch 5 improves the chances of sync compaction to process pageblocks that async compaction has skipped due to being !MIGRATE_MOVABLE. Patch 6 improves the chances of sync direct compaction to actually do anything when called after async compaction fails during allocation slowpath. The impact of patches were validated using mmtests's stress-highalloc benchmark with mmtests's stress-highalloc benchmark on a x86_64 machine with 4GB memory. Due to instability of the results (mostly related to the bugs fixed by patches 2 and 3), 10 iterations were performed, taking min,mean,max values for success rates and mean values for time and vmstat-based metrics. First, the default GFP_HIGHUSER_MOVABLE allocations were tested with the patches stacked on top of v3.13-rc2. Patch 2 is OK to serve as baseline due to no functional changes in 1 and 2. Comments below. stress-highalloc 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Success 1 Min 9.00 ( 0.00%) 10.00 (-11.11%) 43.00 (-377.78%) 43.00 (-377.78%) 33.00 (-266.67%) Success 1 Mean 27.50 ( 0.00%) 25.30 ( 8.00%) 45.50 (-65.45%) 45.90 (-66.91%) 46.30 (-68.36%) Success 1 Max 36.00 ( 0.00%) 36.00 ( 0.00%) 47.00 (-30.56%) 48.00 (-33.33%) 52.00 (-44.44%) Success 2 Min 10.00 ( 0.00%) 8.00 ( 20.00%) 46.00 (-360.00%) 45.00 (-350.00%) 35.00 (-250.00%) Success 2 Mean 26.40 ( 0.00%) 23.50 ( 10.98%) 47.30 (-79.17%) 47.60 (-80.30%) 48.10 (-82.20%) Success 2 Max 34.00 ( 0.00%) 33.00 ( 2.94%) 48.00 (-41.18%) 50.00 (-47.06%) 54.00 (-58.82%) Success 3 Min 65.00 ( 0.00%) 63.00 ( 3.08%) 85.00 (-30.77%) 84.00 (-29.23%) 85.00 (-30.77%) Success 3 Mean 76.70 ( 0.00%) 70.50 ( 8.08%) 86.20 (-12.39%) 85.50 (-11.47%) 86.00 (-12.13%) Success 3 Max 87.00 ( 0.00%) 86.00 ( 1.15%) 88.00 ( -1.15%) 87.00 ( 0.00%) 87.00 ( 0.00%) 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp User 6437.72 6459.76 5960.32 5974.55 6019.67 System 1049.65 1049.09 1029.32 1031.47 1032.31 Elapsed 1856.77 1874.48 1949.97 1994.22 1983.15 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Minor Faults 253952267 254581900 250030122 250507333 250157829 Major Faults 420 407 506 530 530 Swap Ins 4 9 9 6 6 Swap Outs 398 375 345 346 333 Direct pages scanned 197538 189017 298574 287019 299063 Kswapd pages scanned 1809843 1801308 1846674 1873184 1861089 Kswapd pages reclaimed 1806972 1798684 1844219 1870509 1858622 Direct pages reclaimed 197227 188829 298380 286822 298835 Kswapd efficiency 99% 99% 99% 99% 99% Kswapd velocity 953.382 970.449 952.243 934.569 922.286 Direct efficiency 99% 99% 99% 99% 99% Direct velocity 104.058 101.832 153.961 143.200 148.205 Percentage direct scans 9% 9% 13% 13% 13% Zone normal velocity 347.289 359.676 348.063 339.933 332.983 Zone dma32 velocity 710.151 712.605 758.140 737.835 737.507 Zone dma velocity 0.000 0.000 0.000 0.000 0.000 Page writes by reclaim 557.600 429.000 353.600 426.400 381.800 Page writes file 159 53 7 79 48 Page writes anon 398 375 345 346 333 Page reclaim immediate 825 644 411 575 420 Sector Reads 2781750 2769780 2878547 2939128 2910483 Sector Writes 12080843 12083351 12012892 12002132 12010745 Page rescued immediate 0 0 0 0 0 Slabs scanned 1575654 1545344 1778406 1786700 1794073 Direct inode steals 9657 10037 15795 14104 14645 Kswapd inode steals 46857 46335 50543 50716 51796 Kswapd skipped wait 0 0 0 0 0 THP fault alloc 97 91 81 71 77 THP collapse alloc 456 506 546 544 565 THP splits 6 5 5 4 4 THP fault fallback 0 1 0 0 0 THP collapse fail 14 14 12 13 12 Compaction stalls 1006 980 1537 1536 1548 Compaction success 303 284 562 559 578 Compaction failures 702 696 974 976 969 Page migrate success 1177325 1070077 3927538 3781870 3877057 Page migrate failure 0 0 0 0 0 Compaction pages isolated 2547248 2306457 8301218 8008500 8200674 Compaction migrate scanned 42290478 38832618 153961130 154143900 159141197 Compaction free scanned 89199429 79189151 356529027 351943166 356326727 Compaction cost 1566 1426 5312 5156 5294 NUMA PTE updates 0 0 0 0 0 NUMA hint faults 0 0 0 0 0 NUMA hint local faults 0 0 0 0 0 NUMA hint local percent 100 100 100 100 100 NUMA pages migrated 0 0 0 0 0 AutoNUMA cost 0 0 0 0 0 Observations: - The "Success 3" line is allocation success rate with system idle (phases 1 and 2 are with background interference). I used to get stable values around 85% with vanilla 3.11. The lower min and mean values came with 3.12. This was bisected to commit 81c0a2bb ("mm: page_alloc: fair zone allocator policy") As explained in comment for patch 3, I don't think the commit is wrong, but that it makes the effect of compaction bugs worse. From patch 3 onwards, the results are OK and match the 3.11 results. - Patch 4 also clearly helps phases 1 and 2, and exceeds any results I've seen with 3.11 (I didn't measure it that thoroughly then, but it was never above 40%). - Compaction cost and number of scanned pages is higher, especially due to patch 4. However, keep in mind that patches 3 and 4 fix existing bugs in the current design of compaction overhead mitigation, they do not change it. If overhead is found unacceptable, then it should be decreased differently (and consistently, not due to random conditions) than the current implementation does. In contrast, patches 5 and 6 (which are not strictly bug fixes) do not increase the overhead (but also not success rates). This might be a limitation of the stress-highalloc benchmark as it's quite uniform. Another set of results is when configuring stress-highalloc t allocate with similar flags as THP uses: (GFP_HIGHUSER_MOVABLE|__GFP_NOMEMALLOC|__GFP_NORETRY|__GFP_NO_KSWAPD) stress-highalloc 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp Success 1 Min 2.00 ( 0.00%) 7.00 (-250.00%) 18.00 (-800.00%) 19.00 (-850.00%) 26.00 (-1200.00%) Success 1 Mean 19.20 ( 0.00%) 17.80 ( 7.29%) 29.20 (-52.08%) 29.90 (-55.73%) 32.80 (-70.83%) Success 1 Max 27.00 ( 0.00%) 29.00 ( -7.41%) 35.00 (-29.63%) 36.00 (-33.33%) 37.00 (-37.04%) Success 2 Min 3.00 ( 0.00%) 8.00 (-166.67%) 21.00 (-600.00%) 21.00 (-600.00%) 32.00 (-966.67%) Success 2 Mean 19.30 ( 0.00%) 17.90 ( 7.25%) 32.20 (-66.84%) 32.60 (-68.91%) 35.70 (-84.97%) Success 2 Max 27.00 ( 0.00%) 30.00 (-11.11%) 36.00 (-33.33%) 37.00 (-37.04%) 39.00 (-44.44%) Success 3 Min 62.00 ( 0.00%) 62.00 ( 0.00%) 85.00 (-37.10%) 75.00 (-20.97%) 64.00 ( -3.23%) Success 3 Mean 66.30 ( 0.00%) 65.50 ( 1.21%) 85.60 (-29.11%) 83.40 (-25.79%) 83.50 (-25.94%) Success 3 Max 70.00 ( 0.00%) 69.00 ( 1.43%) 87.00 (-24.29%) 86.00 (-22.86%) 87.00 (-24.29%) 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp User 6547.93 6475.85 6265.54 6289.46 6189.96 System 1053.42 1047.28 1043.23 1042.73 1038.73 Elapsed 1835.43 1821.96 1908.67 1912.74 1956.38 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp Minor Faults 256805673 253106328 253222299 249830289 251184418 Major Faults 395 375 423 434 448 Swap Ins 12 10 10 12 9 Swap Outs 530 537 487 455 415 Direct pages scanned 71859 86046 153244 152764 190713 Kswapd pages scanned 1900994 1870240 1898012 1892864 1880520 Kswapd pages reclaimed 1897814 1867428 1894939 1890125 1877924 Direct pages reclaimed 71766 85908 153167 152643 190600 Kswapd efficiency 99% 99% 99% 99% 99% Kswapd velocity 1029.000 1067.782 1000.091 991.049 951.218 Direct efficiency 99% 99% 99% 99% 99% Direct velocity 38.897 49.127 80.747 79.983 96.468 Percentage direct scans 3% 4% 7% 7% 9% Zone normal velocity 351.377 372.494 348.910 341.689 335.310 Zone dma32 velocity 716.520 744.414 731.928 729.343 712.377 Zone dma velocity 0.000 0.000 0.000 0.000 0.000 Page writes by reclaim 669.300 604.000 545.700 538.900 429.900 Page writes file 138 66 58 83 14 Page writes anon 530 537 487 455 415 Page reclaim immediate 806 655 772 548 517 Sector Reads 2711956 2703239 2811602 2818248 2839459 Sector Writes 12163238 12018662 12038248 11954736 11994892 Page rescued immediate 0 0 0 0 0 Slabs scanned 1385088 1388364 1507968 1513292 1558656 Direct inode steals 1739 2564 4622 5496 6007 Kswapd inode steals 47461 46406 47804 48013 48466 Kswapd skipped wait 0 0 0 0 0 THP fault alloc 110 82 84 69 70 THP collapse alloc 445 482 467 462 539 THP splits 6 5 4 5 3 THP fault fallback 3 0 0 0 0 THP collapse fail 15 14 14 14 13 Compaction stalls 659 685 1033 1073 1111 Compaction success 222 225 410 427 456 Compaction failures 436 460 622 646 655 Page migrate success 446594 439978 1085640 1095062 1131716 Page migrate failure 0 0 0 0 0 Compaction pages isolated 1029475 1013490 2453074 2482698 2565400 Compaction migrate scanned 9955461 11344259 24375202 27978356 30494204 Compaction free scanned 27715272 28544654 80150615 82898631 85756132 Compaction cost 552 555 1344 1379 1436 NUMA PTE updates 0 0 0 0 0 NUMA hint faults 0 0 0 0 0 NUMA hint local faults 0 0 0 0 0 NUMA hint local percent 100 100 100 100 100 NUMA pages migrated 0 0 0 0 0 AutoNUMA cost 0 0 0 0 0 There are some differences from the previous results for THP-like allocations: - Here, the bad result for unpatched kernel in phase 3 is much more consistent to be between 65-70% and not related to the "regression" in 3.12. Still there is the improvement from patch 4 onwards, which brings it on par with simple GFP_HIGHUSER_MOVABLE allocations. - Compaction costs have increased, but nowhere near as much as the non-THP case. Again, the patches should be worth the gained determininsm. - Patches 5 and 6 somewhat increase the number of migrate-scanned pages. This is most likely due to __GFP_NO_KSWAPD flag, which means the cached pfn's and pageblock skip bits are not reset by kswapd that often (at least in phase 3 where no concurrent activity would wake up kswapd) and the patches thus help the sync-after-async compaction. It doesn't however show that the sync compaction would help so much with success rates, which can be again seen as a limitation of the benchmark scenario. This patch (of 6): Add two tracepoints for compaction begin and end of a zone. Using this it is possible to calculate how much time a workload is spending within compaction and potentially debug problems related to cached pfns for scanning. In combination with the direct reclaim and slab trace points it should be possible to estimate most allocation-related overhead for a workload. Signed-off-by: Mel Gorman Signed-off-by: Vlastimil Babka Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd016f43..a03995eddedb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -970,6 +970,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) __reset_isolation_suitable(zone); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { @@ -1015,6 +1017,8 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } -- cgit v1.2.3 From de6c60a6c115acaa721cfd499e028a413d1fcbf3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:07 -0800 Subject: mm: compaction: encapsulate defer reset logic Currently there are several functions to manipulate the deferred compaction state variables. The remaining case where the variables are touched directly is when a successful allocation occurs in direct compaction, or is expected to be successful in the future by kswapd. Here, the lowest order that is expected to fail is updated, and in the case of successful allocation, the deferred status and counter is reset completely. Create a new function compaction_defer_reset() to encapsulate this functionality and make it easier to understand the code. No functional change. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++++----- mm/page_alloc.c | 5 +---- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a03995eddedb..927de97cab8d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1124,12 +1124,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order >= zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) + else if (cc->sync) defer_compaction(zone, cc->order); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b230e838883d..84da0e3bc886 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2235,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } -- cgit v1.2.3 From d3132e4b83e6bd383c74d716f7281d7c3136089c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:08 -0800 Subject: mm: compaction: reset cached scanner pfn's before reading them Compaction caches pfn's for its migrate and free scanners to avoid scanning the whole zone each time. In compact_zone(), the cached values are read to set up initial values for the scanners. There are several situations when these cached pfn's are reset to the first and last pfn of the zone, respectively. One of these situations is when a compaction has been deferred for a zone and is now being restarted during a direct compaction, which is also done in compact_zone(). However, compact_zone() currently reads the cached pfn's *before* resetting them. This means the reset doesn't affect the compaction that performs it, and with good chance also subsequent compactions, as update_pageblock_skip() is likely to be called and update the cached pfn's to those being processed. Another chance for a successful reset is when a direct compaction detects that migration and free scanners meet (which has its own problems addressed by another patch) and sets update_pageblock_skip flag which kswapd uses to do the reset because it goes to sleep. This is clearly a bug that results in non-deterministic behavior, so this patch moves the cached pfn reset to be performed *before* the values are read. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 927de97cab8d..f4e2c166880b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -946,6 +946,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + /* * Setup to move all movable pages to the end of the zone. Used cached * information on where the scanners should start but check that it @@ -962,14 +970,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* - * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. - */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) - __reset_isolation_suitable(zone); - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); migrate_prep_local(); -- cgit v1.2.3 From 7ed695e069c3cbea5e1fd08f84a04536da91f584 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:09 -0800 Subject: mm: compaction: detect when scanners meet in isolate_freepages Compaction of a zone is finished when the migrate scanner (which begins at the zone's lowest pfn) meets the free page scanner (which begins at the zone's highest pfn). This is detected in compact_zone() and in the case of direct compaction, the compact_blockskip_flush flag is set so that kswapd later resets the cached scanner pfn's, and a new compaction may again start at the zone's borders. The meeting of the scanners can happen during either scanner's activity. However, it may currently fail to be detected when it occurs in the free page scanner, due to two problems. First, isolate_freepages() keeps free_pfn at the highest block where it isolated pages from, for the purposes of not missing the pages that are returned back to allocator when migration fails. Second, failing to isolate enough free pages due to scanners meeting results in -ENOMEM being returned by migrate_pages(), which makes compact_zone() bail out immediately without calling compact_finished() that would detect scanners meeting. This failure to detect scanners meeting might result in repeated attempts at compaction of a zone that keep starting from the cached pfn's close to the meeting point, and quickly failing through the -ENOMEM path, without the cached pfns being reset, over and over. This has been observed (through additional tracepoints) in the third phase of the mmtests stress-highalloc benchmark, where the allocator runs on an otherwise idle system. The problem was observed in the DMA32 zone, which was used as a fallback to the preferred Normal zone, but on the 4GB system it was actually the largest zone. The problem is even amplified for such fallback zone - the deferred compaction logic, which could (after being fixed by a previous patch) reset the cached scanner pfn's, is only applied to the preferred zone and not for the fallbacks. The problem in the third phase of the benchmark was further amplified by commit 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") which resulted in a non-deterministic regression of the allocation success rate from ~85% to ~65%. This occurs in about half of benchmark runs, making bisection problematic. It is unlikely that the commit itself is buggy, but it should put more pressure on the DMA32 zone during phases 1 and 2, which may leave it more fragmented in phase 3 and expose the bugs that this patch fixes. The fix is to make scanners meeting in isolate_freepage() stay that way, and to check in compact_zone() for scanners meeting when migrate_pages() returns -ENOMEM. The result is that compact_finished() also detects scanners meeting and sets the compact_blockskip_flush flag to make kswapd reset the scanner pfn's. The results in stress-highalloc benchmark show that the "regression" by commit 81c0a2bb515f in phase 3 no longer occurs, and phase 1 and 2 allocation success rates are also significantly improved. Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f4e2c166880b..cc46db36e708 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -660,7 +660,7 @@ static void isolate_freepages(struct zone *zone, * is the end of the pageblock the migration scanner is using. */ pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* * Take care that if the migration scanner is at the end of the zone @@ -676,7 +676,7 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -738,7 +738,14 @@ static void isolate_freepages(struct zone *zone, /* split_free_page does not map the pages */ map_pages(freelist); - cc->free_pfn = high_pfn; + /* + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ + if (pfn < low_pfn) + cc->free_pfn = max(pfn, zone->zone_start_pfn); + else + cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; } @@ -1005,7 +1012,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - if (err == -ENOMEM) { + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it + */ + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { ret = COMPACT_PARTIAL; goto out; } -- cgit v1.2.3 From 50b5b094e683f8e51e82c6dfe97b1608cf97e6c0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:10 -0800 Subject: mm: compaction: do not mark unmovable pageblocks as skipped in async compaction Compaction temporarily marks pageblocks where it fails to isolate pages as to-be-skipped in further compactions, in order to improve efficiency. One of the reasons to fail isolating pages is that isolation is not attempted in pageblocks that are not of MIGRATE_MOVABLE (or CMA) type. The problem is that blocks skipped due to not being MIGRATE_MOVABLE in async compaction become skipped due to the temporary mark also in future sync compaction. Moreover, this may follow quite soon during __alloc_page_slowpath, without much time for kswapd to clear the pageblock skip marks. This goes against the idea that sync compaction should try to scan these blocks more thoroughly than the async compaction. The fix is to ensure in async compaction that these !MIGRATE_MOVABLE blocks are not marked to be skipped. Note this should not affect performance or locking impact of further async compactions, as skipping a block due to being !MIGRATE_MOVABLE is done soon after skipping a block marked to be skipped, both without locking. Signed-off-by: Vlastimil Babka Cc: Rik van Riel Acked-by: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index cc46db36e708..32a033cb5c65 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; + bool skipped_async_unsuitable = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { cc->finished_update_migrate = true; + skipped_async_unsuitable = true; goto next_pageblock; } @@ -627,8 +629,13 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (low_pfn == end_pfn) + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + * This is not done when pageblock was skipped due to being unsuitable + * for async compaction, so that eventual sync compaction can try. + */ + if (low_pfn == end_pfn && !skipped_async_unsuitable) update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); -- cgit v1.2.3 From 55b7c4c99f6a448f72179297fe6432544f220063 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:11 -0800 Subject: mm: compaction: reset scanner positions immediately when they meet Compaction used to start its migrate and free page scaners at the zone's lowest and highest pfn, respectively. Later, caching was introduced to remember the scanners' progress across compaction attempts so that pageblocks are not re-scanned uselessly. Additionally, pageblocks where isolation failed are marked to be quickly skipped when encountered again in future compactions. Currently, both the reset of cached pfn's and clearing of the pageblock skip information for a zone is done in __reset_isolation_suitable(). This function gets called when: - compaction is restarting after being deferred - compact_blockskip_flush flag is set in compact_finished() when the scanners meet (and not again cleared when direct compaction succeeds in allocation) and kswapd acts upon this flag before going to sleep This behavior is suboptimal for several reasons: - when direct sync compaction is called after async compaction fails (in the allocation slowpath), it will effectively do nothing, unless kswapd happens to process the compact_blockskip_flush flag meanwhile. This is racy and goes against the purpose of sync compaction to more thoroughly retry the compaction of a zone where async compaction has failed. The restart-after-deferring path cannot help here as deferring happens only after the sync compaction fails. It is also done only for the preferred zone, while the compaction might be done for a fallback zone. - the mechanism of marking pageblock to be skipped has little value since the cached pfn's are reset only together with the pageblock skip flags. This effectively limits pageblock skip usage to parallel compactions. This patch changes compact_finished() so that cached pfn's are reset immediately when the scanners meet. Clearing pageblock skip flags is unchanged, as well as the other situations where cached pfn's are reset. This allows the sync-after-async compaction to retry pageblocks not marked as skipped, such as blocks !MIGRATE_MOVABLE blocks that async compactions now skips without marking them. Signed-off-by: Vlastimil Babka Cc: Rik van Riel Acked-by: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 32a033cb5c65..3a91a2ea3d34 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -851,6 +851,10 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kswapd does not set the -- cgit v1.2.3 From aed0a0e32de387da831284fda25021de32477195 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 21 Jan 2014 15:51:12 -0800 Subject: mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure __GFP_NOFAIL may return NULL when coupled with GFP_NOWAIT or GFP_ATOMIC. Luckily, nothing currently does such craziness. So instead of causing such allocations to loop (potentially forever), we maintain the current behavior and also warn about the new users of the deprecated flag. Suggested-by: Andrew Morton Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84da0e3bc886..533e2147d14f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2525,8 +2525,15 @@ rebalance: } /* Atomic allocations - we can't balance anything */ - if (!wait) + if (!wait) { + /* + * All existing users of the deprecated __GFP_NOFAIL are + * blockable, so warn of any new users that actually allow this + * type of allocation to fail. + */ + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) -- cgit v1.2.3 From 354a3363363724c21ea2e4b28370e27983c2452e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 21 Jan 2014 15:51:14 -0800 Subject: mm/migrate: add comment about permanent failure path Let's add a comment about where the failed page goes to, which makes code more readable. Signed-off-by: Naoya Horiguchi Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Wanpeng Li Acked-by: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index f9e16350d09c..626ca3c5d07b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1125,7 +1125,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, nr_succeeded++; break; default: - /* Permanent failure */ + /* + * Permanent failure (-EBUSY, -ENOSYS, etc.): + * unlike -EAGAIN case, the failed page is + * removed from migration page list and not + * retried in the next outer loop. + */ nr_failed++; break; } -- cgit v1.2.3 From 32665f2bbfed2e325d37236d9b0071a11a69124e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:15 -0800 Subject: mm/migrate: correct failure handling if !hugepage_migration_support() We should remove the page from the list if we fail with ENOSYS, since migrate_pages() consider error cases except -ENOMEM and -EAGAIN as permanent failure and it assumes that the page would be removed from the list. Without this patch, we could overcount number of failure. In addition, we should put back the new hugepage if !hugepage_migration_support(). If not, we would leak hugepage memory. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 626ca3c5d07b..13bedcc4656b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1013,7 +1013,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = 0; int *result = NULL; - struct page *new_hpage = get_new_page(hpage, private, &result); + struct page *new_hpage; struct anon_vma *anon_vma = NULL; /* @@ -1023,9 +1023,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) + if (!hugepage_migration_support(page_hstate(hpage))) { + putback_active_hugepage(hpage); return -ENOSYS; + } + new_hpage = get_new_page(hpage, private, &result); if (!new_hpage) return -ENOMEM; -- cgit v1.2.3 From 59c82b70dcd9cc273c21fae5abc29e41fc732a17 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:17 -0800 Subject: mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages Some part of putback_lru_pages() and putback_movable_pages() is duplicated, so it could confuse us what we should use. We can remove putback_lru_pages() since it is not really needed now. This makes us undestand and maintain the code more easily. And comment on putback_movable_pages() is stale now, so fix it. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 +++++++- mm/migrate.c | 29 +++++++++-------------------- 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9fa6586d5275..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - putback_lru_pages(&pagelist); + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 13bedcc4656b..8a73d66be102 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -71,29 +71,13 @@ int migrate_prep_local(void) return 0; } -/* - * Add isolated pages on the list back to the LRU under page lock - * to avoid leaking evictable pages back onto unevictable list. - */ -void putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - - list_for_each_entry_safe(page, page2, l, lru) { - list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } -} - /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * - * This function shall be used instead of putback_lru_pages(), - * whenever the isolated pageset has been built by isolate_migratepages_range() + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). */ void putback_movable_pages(struct list_head *l) { @@ -1725,7 +1709,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { - putback_lru_pages(&migratepages); + if (!list_empty(&migratepages)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } isolated = 0; } else count_vm_numa_event(NUMA_PAGE_MIGRATE); -- cgit v1.2.3 From 78d5506e82b21a1a1de68c24182db2c2fe521422 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:18 -0800 Subject: mm/migrate: remove unused function, fail_migrate_page() fail_migrate_page() isn't used anywhere, so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 8a73d66be102..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -552,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Migration functions ***********************************************************/ -/* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate/PagePrivate2. -- cgit v1.2.3 From 12ab028be0008640de712ca890dc1a9ae224934d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 23 Jan 2014 15:52:48 -0800 Subject: mm/zswap.c: change params from hidden to ro The "compressor" and "enabled" params are currently hidden, this changes them to read-only, so userspace can tell if zswap is enabled or not and see what compressor is in use. Signed-off-by: Dan Streetman Cc: Vladimir Murzin Cc: Bob Liu Cc: Minchan Kim Cc: Weijie Yang Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 5a63f78a5601..e55bab9dc41f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -77,12 +77,12 @@ static u64 zswap_duplicate_entry; **********************************/ /* Enable/disable zswap (disabled by default, fixed at boot for now) */ static bool zswap_enabled __read_mostly; -module_param_named(enabled, zswap_enabled, bool, 0); +module_param_named(enabled, zswap_enabled, bool, 0444); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; -module_param_named(compressor, zswap_compressor, charp, 0); +module_param_named(compressor, zswap_compressor, charp, 0444); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; -- cgit v1.2.3 From f0b791a34cb3cffd2bbc3ca4365c9b719fa2c9f3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jan 2014 15:52:49 -0800 Subject: mm: print more details for bad_page() bad_page() is cool in that it prints out a bunch of data about the page. But, I can never remember which page flags are good and which are bad, or whether ->index or ->mapping is required to be NULL. This patch allows bad/dump_page() callers to specify a string about why they are dumping the page and adds explanation strings to a number of places. It also adds a 'bad_flags' argument to bad_page(), which it then dumps out separately from the flags which are actually set. This way, the messages will show specifically why the page was bad, *specifically* which flags it is complaining about, if it was a page flag combination which was the problem. [akpm@linux-foundation.org: switch to pr_alert] Signed-off-by: Dave Hansen Reviewed-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/balloon_compaction.c | 4 +-- mm/memory.c | 2 +- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 72 ++++++++++++++++++++++++++++++++++++------------- 4 files changed, 58 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 07dbc8ec46cf..6e45a5074bf0 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page) put_page(page); } else { WARN_ON(1); - dump_page(page); + dump_page(page, "not movable balloon page"); } unlock_page(page); } @@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage, BUG_ON(!trylock_page(newpage)); if (WARN_ON(!__is_movable_balloon_page(page))) { - dump_page(page); + dump_page(page, "not movable balloon page"); unlock_page(newpage); return rc; } diff --git a/mm/memory.c b/mm/memory.c index 86487dfa5e59..71d70c082b98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -671,7 +671,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) - dump_page(page); + dump_page(page, "bad pte"); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cc2ab37220b7..a512a47241a4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1309,7 +1309,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); - dump_page(page); + dump_page(page, "failed to remove from LRU"); #endif put_page(page); /* Because we don't have big zone->lock. we should diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 533e2147d14f..1939f4446a36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page) +static void bad_page(struct page *page, char *reason, unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -329,7 +329,7 @@ static void bad_page(struct page *page) printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page(page); + dump_page_badflags(page, reason, bad_flags); print_modules(); dump_stack(); @@ -383,7 +383,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) int bad = 0; if (unlikely(compound_order(page) != order)) { - bad_page(page); + bad_page(page, "wrong compound order", 0); bad++; } @@ -392,8 +392,11 @@ static int destroy_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) || (p->first_page != page))) { - bad_page(page); + if (unlikely(!PageTail(p))) { + bad_page(page, "PageTail not set", 0); + bad++; + } else if (unlikely(p->first_page != page)) { + bad_page(page, "first_page not consistent", 0); bad++; } __ClearPageTail(p); @@ -618,12 +621,23 @@ out: static inline int free_pages_check(struct page *page) { - if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | - (mem_cgroup_bad_page_check(page)))) { - bad_page(page); + char *bad_reason = NULL; + unsigned long bad_flags = 0; + + if (unlikely(page_mapcount(page))) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(atomic_read(&page->_count) != 0)) + bad_reason = "nonzero _count"; + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + } + if (unlikely(mem_cgroup_bad_page_check(page))) + bad_reason = "cgroup check failed"; + if (unlikely(bad_reason)) { + bad_page(page, bad_reason, bad_flags); return 1; } page_cpupid_reset_last(page); @@ -843,12 +857,23 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | - (mem_cgroup_bad_page_check(page)))) { - bad_page(page); + char *bad_reason = NULL; + unsigned long bad_flags = 0; + + if (unlikely(page_mapcount(page))) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(atomic_read(&page->_count) != 0)) + bad_reason = "nonzero _count"; + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; + bad_flags = PAGE_FLAGS_CHECK_AT_PREP; + } + if (unlikely(mem_cgroup_bad_page_check(page))) + bad_reason = "cgroup check failed"; + if (unlikely(bad_reason)) { + bad_page(page, bad_reason, bad_flags); return 1; } return 0; @@ -6494,12 +6519,23 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page(struct page *page) +void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + if (reason) + pr_alert("page dumped because: %s\n", reason); + if (page->flags & badflags) { + pr_alert("bad because of flags:\n"); + dump_page_flags(page->flags & badflags); + } mem_cgroup_print_bad_page(page); } + +void dump_page(struct page *page, char *reason) +{ + dump_page_badflags(page, reason, 0); +} -- cgit v1.2.3 From 01cc2e58697e34c6ee9a40fb6cebc18bf5a1923f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 23 Jan 2014 15:52:50 -0800 Subject: mm: munlock: fix potential race with THP page split Since commit ff6a6da60b89 ("mm: accelerate munlock() treatment of THP pages") munlock skips tail pages of a munlocked THP page. There is some attempt to prevent bad consequences of racing with a THP page split, but code inspection indicates that there are two problems that may lead to a non-fatal, yet wrong outcome. First, __split_huge_page_refcount() copies flags including PageMlocked from the head page to the tail pages. Clearing PageMlocked by munlock_vma_page() in the middle of this operation might result in part of tail pages left with PageMlocked flag. As the head page still appears to be a THP page until all tail pages are processed, munlock_vma_page() might think it munlocked the whole THP page and skip all the former tail pages. Before ff6a6da60, those pages would be cleared in further iterations of munlock_vma_pages_range(), but NR_MLOCK would still become undercounted (related the next point). Second, NR_MLOCK accounting is based on call to hpage_nr_pages() after the PageMlocked is cleared. The accounting might also become inconsistent due to race with __split_huge_page_refcount() - undercount when HUGE_PMD_NR is subtracted, but some tail pages are left with PageMlocked set and counted again (only possible before ff6a6da60) - overcount when hpage_nr_pages() sees a normal page (split has already finished), but the parallel split has meanwhile cleared PageMlocked from additional tail pages This patch prevents both problems via extending the scope of lru_lock in munlock_vma_page(). This is convenient because: - __split_huge_page_refcount() takes lru_lock for its whole operation - munlock_vma_page() typically takes lru_lock anyway for page isolation As this becomes a second function where page isolation is done with lru_lock already held, factor this out to a new __munlock_isolate_lru_page() function and clean up the code around. [akpm@linux-foundation.org: avoid a coding-style ugly] Signed-off-by: Vlastimil Babka Cc: Sasha Levin Cc: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 104 +++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 10819ed4df3e..b30adbe62034 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -90,6 +90,26 @@ void mlock_vma_page(struct page *page) } } +/* + * Isolate a page from LRU with optional get_page() pin. + * Assumes lru_lock already held and page already pinned. + */ +static bool __munlock_isolate_lru_page(struct page *page, bool getpage) +{ + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); + if (getpage) + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + return true; + } + + return false; +} + /* * Finish munlock after successful page isolation * @@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page) static void __munlock_isolation_failed(struct page *page) { if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); + __count_vm_event(UNEVICTABLE_PGSTRANDED); else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); + __count_vm_event(UNEVICTABLE_PGMUNLOCKED); } /** @@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { unsigned int nr_pages; + struct zone *zone = page_zone(page); BUG_ON(!PageLocked(page)); - if (TestClearPageMlocked(page)) { - nr_pages = hpage_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); - } else { - nr_pages = hpage_nr_pages(page); - } - /* - * Regardless of the original PageMlocked flag, we determine nr_pages - * after touching the flag. This leaves a possible race with a THP page - * split, such that a whole THP page was munlocked, but nr_pages == 1. - * Returning a smaller mask due to that is OK, the worst that can - * happen is subsequent useless scanning of the former tail pages. - * The NR_MLOCK accounting can however become broken. + * Serialize with any parallel __split_huge_page_refcount() which + * might otherwise copy PageMlocked to part of the tail pages before + * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ + spin_lock_irq(&zone->lru_lock); + + nr_pages = hpage_nr_pages(page); + if (!TestClearPageMlocked(page)) + goto unlock_out; + + __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + + if (__munlock_isolate_lru_page(page, true)) { + spin_unlock_irq(&zone->lru_lock); + __munlock_isolated_page(page); + goto out; + } + __munlock_isolation_failed(page); + +unlock_out: + spin_unlock_irq(&zone->lru_lock); + +out: return nr_pages - 1; } @@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct page *page = pvec->pages[i]; if (TestClearPageMlocked(page)) { - struct lruvec *lruvec; - int lru; - - if (PageLRU(page)) { - lruvec = mem_cgroup_page_lruvec(page, zone); - lru = page_lru(page); - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. - */ - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - } else { - __munlock_isolation_failed(page); - goto skip_munlock; - } - - } else { -skip_munlock: /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release would deadlock. + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; + if (__munlock_isolate_lru_page(page, false)) + continue; + else + __munlock_isolation_failed(page); } + + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. We cannot do it under lru_lock however. If it's + * the last pin, __page_cache_release() would deadlock. + */ + pagevec_add(&pvec_putback, pvec->pages[i]); + pvec->pages[i] = NULL; } delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); -- cgit v1.2.3 From 8ff69e2c85f84b6b371e3c1d01207e73c0500125 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:52:52 -0800 Subject: memcg: do not use vmalloc for mem_cgroup allocations The vmalloc was introduced by 33327948782b ("memcgroup: use vmalloc for mem_cgroup allocation"), because at that time MAX_NUMNODES was used for defining the per-node array in the mem_cgroup structure so that the structure could be huge even if the system had the only NUMA node. The situation was significantly improved by commit 45cf7ebd5a03 ("memcg: reduce the size of struct memcg 244-fold"), which made the size of the mem_cgroup structure calculated dynamically depending on the real number of NUMA nodes installed on the system (nr_node_ids), so now there is no point in using vmalloc here: the structure is allocated rarely and on most systems its size is about 1K. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 67dd2a881433..7890ce9d6bd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -381,12 +380,6 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; -static size_t memcg_size(void) -{ - return sizeof(struct mem_cgroup) + - nr_node_ids * sizeof(struct mem_cgroup_per_node *); -} - /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ @@ -6405,14 +6398,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - size_t size = memcg_size(); + size_t size; - /* Can be very big if nr_node_ids is very big */ - if (size < PAGE_SIZE) - memcg = kzalloc(size, GFP_KERNEL); - else - memcg = vzalloc(size); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) return NULL; @@ -6423,10 +6414,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return memcg; out_free: - if (size < PAGE_SIZE) - kfree(memcg); - else - vfree(memcg); + kfree(memcg); return NULL; } @@ -6444,7 +6432,6 @@ out_free: static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); @@ -6465,10 +6452,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) * the cgroup_lock. */ disarm_static_keys(memcg); - if (size < PAGE_SIZE) - kfree(memcg); - else - vfree(memcg); + kfree(memcg); } /* -- cgit v1.2.3 From 309381feaee564281c3d9e90fbca8963bb7428ad Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 23 Jan 2014 15:52:54 -0800 Subject: mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE Most of the VM_BUG_ON assertions are performed on a page. Usually, when one of these assertions fails we'll get a BUG_ON with a call stack and the registers. I've recently noticed based on the requests to add a small piece of code that dumps the page to various VM_BUG_ON sites that the page dump is quite useful to people debugging issues in mm. This patch adds a VM_BUG_ON_PAGE(cond, page) which beyond doing what VM_BUG_ON() does, also dumps the page before executing the actual BUG_ON. [akpm@linux-foundation.org: fix up includes] Signed-off-by: Sasha Levin Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cleancache.c | 6 +++--- mm/compaction.c | 2 +- mm/filemap.c | 16 ++++++++-------- mm/huge_memory.c | 36 ++++++++++++++++++------------------ mm/hugetlb.c | 10 +++++----- mm/hugetlb_cgroup.c | 2 +- mm/internal.h | 10 +++++----- mm/ksm.c | 12 ++++++------ mm/memcontrol.c | 28 ++++++++++++++-------------- mm/memory.c | 8 ++++---- mm/migrate.c | 6 +++--- mm/mlock.c | 4 ++-- mm/page_alloc.c | 21 +++++++++++---------- mm/page_io.c | 4 ++-- mm/rmap.c | 10 +++++----- mm/shmem.c | 8 ++++---- mm/slub.c | 12 ++++++------ mm/swap.c | 36 ++++++++++++++++++------------------ mm/swap_state.c | 16 ++++++++-------- mm/swapfile.c | 8 ++++---- mm/vmscan.c | 20 ++++++++++---------- 21 files changed, 138 insertions(+), 137 deletions(-) (limited to 'mm') diff --git a/mm/cleancache.c b/mm/cleancache.c index 5875f48ce279..d0eac4350403 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -237,7 +237,7 @@ int __cleancache_get_page(struct page *page) goto out; } - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; if (fake_pool_id < 0) goto out; @@ -279,7 +279,7 @@ void __cleancache_put_page(struct page *page) return; } - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; if (fake_pool_id < 0) return; @@ -318,7 +318,7 @@ void __cleancache_invalidate_page(struct address_space *mapping, if (pool_id < 0) return; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, key, page->index); diff --git a/mm/compaction.c b/mm/compaction.c index 3a91a2ea3d34..e0ab02d70f13 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -601,7 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (__isolate_lru_page(page, mode) != 0) continue; - VM_BUG_ON(PageTransCompound(page)); + VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ cc->finished_update_migrate = true; diff --git a/mm/filemap.c b/mm/filemap.c index b7749a92021c..7a7f3e0db738 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -409,9 +409,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - VM_BUG_ON(!PageLocked(old)); - VM_BUG_ON(!PageLocked(new)); - VM_BUG_ON(new->mapping); + VM_BUG_ON_PAGE(!PageLocked(old), old); + VM_BUG_ON_PAGE(!PageLocked(new), new); + VM_BUG_ON_PAGE(new->mapping, new); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { @@ -461,8 +461,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, { int error; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapBacked(page), page); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); @@ -607,7 +607,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); @@ -760,7 +760,7 @@ repeat: page_cache_release(page); goto repeat; } - VM_BUG_ON(page->index != offset); + VM_BUG_ON_PAGE(page->index != offset, page); } return page; } @@ -1656,7 +1656,7 @@ retry_find: put_page(page); goto retry_find; } - VM_BUG_ON(page->index != offset); + VM_BUG_ON_PAGE(page->index != offset, page); /* * We have a locked page in the page cache, now we need to check diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..25fab7150fa0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -712,7 +712,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_t pgtable; spinlock_t *ptl; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -893,7 +893,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out; } src_page = pmd_page(pmd); - VM_BUG_ON(!PageHead(src_page)); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); get_page(src_page); page_dup_rmap(src_page); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1067,7 +1067,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ @@ -1133,7 +1133,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; page = pmd_page(orig_pmd); - VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); if (page_mapcount(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); @@ -1211,7 +1211,7 @@ alloc: add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); put_huge_zero_page(); } else { - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page); put_page(page); } @@ -1249,7 +1249,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto out; page = pmd_page(*pmd); - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); if (flags & FOLL_TOUCH) { pmd_t _pmd; /* @@ -1274,7 +1274,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, } } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) get_page_foll(page); @@ -1432,9 +1432,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { page = pmd_page(orig_pmd); page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); tlb_remove_page(tlb, page); @@ -2176,9 +2176,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (unlikely(!page)) goto out; - VM_BUG_ON(PageCompound(page)); - BUG_ON(!PageAnon(page)); - VM_BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* cannot use mapcount: can't collapse if there's a gup pin */ if (page_count(page) != 1) @@ -2201,8 +2201,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* 0 stands for page_is_file_cache(page) == false */ inc_zone_page_state(page, NR_ISOLATED_ANON + 0); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ if (pte_young(pteval) || PageReferenced(page) || @@ -2232,7 +2232,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to @@ -2311,7 +2311,7 @@ static struct page struct vm_area_struct *vma, unsigned long address, int node) { - VM_BUG_ON(*hpage); + VM_BUG_ON_PAGE(*hpage, *hpage); /* * Allocate the page while the vma is still valid and under * the mmap_sem read mode so there is no memory allocation @@ -2580,7 +2580,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, */ node = page_to_nid(page); khugepaged_node_load[node]++; - VM_BUG_ON(PageCompound(page)); + VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; /* cannot use mapcount: can't collapse if there's a gup pin */ @@ -2876,7 +2876,7 @@ again: return; } page = pmd_page(*pmd); - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); get_page(page); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 04306b9de90d..c01cb9fedb18 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -584,7 +584,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON(hugetlb_cgroup_from_page(page)); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); @@ -1089,7 +1089,7 @@ retry: * no users -- drop the buddy allocator's reference. */ put_page_testzero(page); - VM_BUG_ON(page_count(page)); + VM_BUG_ON_PAGE(page_count(page), page); enqueue_huge_page(h, page); } free: @@ -3503,7 +3503,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) bool isolate_huge_page(struct page *page, struct list_head *list) { - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); if (!get_page_unless_zero(page)) return false; spin_lock(&hugetlb_lock); @@ -3514,7 +3514,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) void putback_active_hugepage(struct page *page) { - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); @@ -3523,7 +3523,7 @@ void putback_active_hugepage(struct page *page) bool is_hugepage_active(struct page *page) { - VM_BUG_ON(!PageHuge(page)); + VM_BUG_ON_PAGE(!PageHuge(page), page); /* * This function can be called for a tail page because the caller, * scan_movable_pages, scans through a given pfn-range which typically diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d747a84e09b0..cb00829bb466 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -390,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON(!PageHuge(oldhpage)); + VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); spin_lock(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); diff --git a/mm/internal.h b/mm/internal.h index a346ba120e42..dc95e979ae56 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -27,8 +27,8 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageTail(page)); - VM_BUG_ON(atomic_read(&page->_count)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(atomic_read(&page->_count), page); set_page_count(page, 1); } @@ -46,7 +46,7 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); if (get_page_head) atomic_inc(&page->first_page->_count); get_huge_page_tail(page); @@ -71,7 +71,7 @@ static inline void get_page_foll(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); atomic_inc(&page->_count); } } @@ -173,7 +173,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) { - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) return 0; diff --git a/mm/ksm.c b/mm/ksm.c index 3df141e5f3e0..f91ddf5c3688 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1898,13 +1898,13 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) int ret = SWAP_AGAIN; int search_new_forks = 0; - VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON_PAGE(!PageKsm(page), page); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); stable_node = page_stable_node(page); if (!stable_node) @@ -1958,13 +1958,13 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; - VM_BUG_ON(!PageLocked(oldpage)); - VM_BUG_ON(!PageLocked(newpage)); - VM_BUG_ON(newpage->mapping != oldpage->mapping); + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); stable_node = page_stable_node(newpage); if (stable_node) { - VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); stable_node->kpfn = page_to_pfn(newpage); /* * newpage->mapping was set in advance; now we need smp_wmb() diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7890ce9d6bd1..72f2d90e7ef6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2897,7 +2897,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) unsigned short id; swp_entry_t ent; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); lock_page_cgroup(pc); @@ -2931,7 +2931,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, bool anon; lock_page_cgroup(pc); - VM_BUG_ON(PageCgroupUsed(pc)); + VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. @@ -2966,7 +2966,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, if (lrucare) { if (was_on_lru) { lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } @@ -3780,7 +3780,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) if (!memcg) return; - VM_BUG_ON(mem_cgroup_is_root(memcg)); + VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else @@ -3859,7 +3859,7 @@ static int mem_cgroup_move_account(struct page *page, bool anon = PageAnon(page); VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); /* * The page is isolated from LRU. So, collapse function * will not handle this page. But page splitting can happen. @@ -3952,7 +3952,7 @@ static int mem_cgroup_move_parent(struct page *page, parent = root_mem_cgroup; if (nr_pages > 1) { - VM_BUG_ON(!PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); flags = compound_lock_irqsave(page); } @@ -3986,7 +3986,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); - VM_BUG_ON(!PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* * Never OOM-kill a process for a huge page. The * fault handler will fall back to regular pages. @@ -4006,8 +4006,8 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - VM_BUG_ON(page_mapped(page)); - VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_ANON); @@ -4211,7 +4211,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); - VM_BUG_ON(!PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); } /* * Check if our page_cgroup is valid @@ -4303,7 +4303,7 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. */ if (page_mapped(page)) return; - VM_BUG_ON(page->mapping && !PageAnon(page)); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); /* * If the page is in swap cache, uncharge should be deferred * to the swap path, which also properly accounts swap usage @@ -4323,8 +4323,8 @@ void mem_cgroup_uncharge_page(struct page *page) void mem_cgroup_uncharge_cache_page(struct page *page) { - VM_BUG_ON(page_mapped(page)); - VM_BUG_ON(page->mapping); + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping, page); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); } @@ -6880,7 +6880,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, enum mc_target_type ret = MC_TARGET_NONE; page = pmd_page(pmd); - VM_BUG_ON(!page || !PageHead(page)); + VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!move_anon()) return ret; pc = lookup_page_cgroup(page); diff --git a/mm/memory.c b/mm/memory.c index 71d70c082b98..be6a0c0d4ae0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -289,7 +289,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) return 0; batch = tlb->active; } - VM_BUG_ON(batch->nr > batch->max); + VM_BUG_ON_PAGE(batch->nr > batch->max, page); return batch->max - batch->nr; } @@ -2702,7 +2702,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto unwritable_page; } } else - VM_BUG_ON(!PageLocked(old_page)); + VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); /* * Since we dropped the lock we need to revalidate @@ -3358,7 +3358,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else - VM_BUG_ON(!PageLocked(vmf.page)); + VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); /* * Should we do an early C-O-W break? @@ -3395,7 +3395,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto unwritable_page; } } else - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); page_mkwrite = 1; } } diff --git a/mm/migrate.c b/mm/migrate.c index a8025befc323..4b3996eb7f0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -499,7 +499,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageUptodate(page)) SetPageUptodate(newpage); if (TestClearPageActive(page)) { - VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON_PAGE(PageUnevictable(page), page); SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); @@ -871,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * free the metadata, so the page can be freed. */ if (!page->mapping) { - VM_BUG_ON(PageAnon(page)); + VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); goto uncharge; @@ -1618,7 +1618,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; - VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); + VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) diff --git a/mm/mlock.c b/mm/mlock.c index b30adbe62034..4e1a68162285 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -279,8 +279,8 @@ static int __mlock_posix_error_return(long retval) static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, int *pgrescued) { - VM_BUG_ON(PageLRU(page)); - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (page_mapcount(page) <= 1 && page_evictable(page)) { pagevec_add(pvec, page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1939f4446a36..f18f016cca80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -509,12 +509,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (page_is_guard(buddy) && page_order(buddy) == order) { - VM_BUG_ON(page_count(buddy) != 0); + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { - VM_BUG_ON(page_count(buddy) != 0); + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return 1; } return 0; @@ -564,8 +564,8 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & ((1 << order) - 1)); - VM_BUG_ON(bad_range(zone, page)); + VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); while (order < MAX_ORDER-1) { buddy_idx = __find_buddy_index(page_idx, order); @@ -827,7 +827,7 @@ static inline void expand(struct zone *zone, struct page *page, area--; high--; size >>= 1; - VM_BUG_ON(bad_range(zone, &page[size])); + VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); #ifdef CONFIG_DEBUG_PAGEALLOC if (high < debug_guardpage_minorder()) { @@ -980,7 +980,7 @@ int move_freepages(struct zone *zone, for (page = start_page; page <= end_page;) { /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1429,8 +1429,8 @@ void split_page(struct page *page, unsigned int order) { int i; - VM_BUG_ON(PageCompound(page)); - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!page_count(page), page); #ifdef CONFIG_KMEMCHECK /* @@ -1577,7 +1577,7 @@ again: zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); - VM_BUG_ON(bad_range(zone, page)); + VM_BUG_ON_PAGE(bad_range(zone, page), page); if (prep_new_page(page, order, gfp_flags)) goto again; return page; @@ -6021,7 +6021,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); - VM_BUG_ON(!zone_spans_pfn(zone, pfn)); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) if (flags & value) @@ -6539,3 +6539,4 @@ void dump_page(struct page *page, char *reason) { dump_page_badflags(page, reason, 0); } +EXPORT_SYMBOL_GPL(dump_page); diff --git a/mm/page_io.c b/mm/page_io.c index 8c79a4764be0..7247be6114ac 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -320,8 +320,8 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageUptodate(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index 962e2a1e13a0..2dcd3353c3f6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -894,9 +894,9 @@ void page_move_anon_rmap(struct page *page, { struct anon_vma *anon_vma = vma->anon_vma; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON(!anon_vma); - VM_BUG_ON(page->index != linear_page_index(vma, address)); + VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; @@ -995,7 +995,7 @@ void do_page_add_anon_rmap(struct page *page, if (unlikely(PageKsm(page))) return; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, exclusive); @@ -1481,7 +1481,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); /* * During exec, a temporary VMA is setup and later moved. @@ -1533,7 +1533,7 @@ int try_to_munlock(struct page *page) }; - VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); ret = rmap_walk(page, &rwc); return ret; diff --git a/mm/shmem.c b/mm/shmem.c index 902a14842b74..8156f95ec0cf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -285,8 +285,8 @@ static int shmem_add_to_page_cache(struct page *page, { int error; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); page_cache_get(page); page->mapping = mapping; @@ -491,7 +491,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; if (!unfalloc || !PageUptodate(page)) { if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); + VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } } @@ -568,7 +568,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, lock_page(page); if (!unfalloc || !PageUptodate(page)) { if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); + VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } } diff --git a/mm/slub.c b/mm/slub.c index 545a170ebf9f..34bb8c65a2d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1559,7 +1559,7 @@ static inline void *acquire_slab(struct kmem_cache *s, new.freelist = freelist; } - VM_BUG_ON(new.frozen); + VM_BUG_ON_PAGE(new.frozen, &new); new.frozen = 1; if (!__cmpxchg_double_slab(s, page, @@ -1812,7 +1812,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, set_freepointer(s, freelist, prior); new.counters = counters; new.inuse--; - VM_BUG_ON(!new.frozen); + VM_BUG_ON_PAGE(!new.frozen, &new); } while (!__cmpxchg_double_slab(s, page, prior, counters, @@ -1840,7 +1840,7 @@ redo: old.freelist = page->freelist; old.counters = page->counters; - VM_BUG_ON(!old.frozen); + VM_BUG_ON_PAGE(!old.frozen, &old); /* Determine target state of the slab */ new.counters = old.counters; @@ -1952,7 +1952,7 @@ static void unfreeze_partials(struct kmem_cache *s, old.freelist = page->freelist; old.counters = page->counters; - VM_BUG_ON(!old.frozen); + VM_BUG_ON_PAGE(!old.frozen, &old); new.counters = old.counters; new.freelist = old.freelist; @@ -2225,7 +2225,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) counters = page->counters; new.counters = counters; - VM_BUG_ON(!new.frozen); + VM_BUG_ON_PAGE(!new.frozen, &new); new.inuse = page->objects; new.frozen = freelist != NULL; @@ -2319,7 +2319,7 @@ load_freelist: * page is pointing to the page from which the objects are obtained. * That page must be frozen for per cpu allocations to work. */ - VM_BUG_ON(!c->page->frozen); + VM_BUG_ON_PAGE(!c->page->frozen, c->page); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_irq_restore(flags); diff --git a/mm/swap.c b/mm/swap.c index d1100b619e61..b31ba67d440a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -57,7 +57,7 @@ static void __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -130,8 +130,8 @@ static void put_compound_page(struct page *page) * __split_huge_page_refcount cannot race * here. */ - VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) != 0); + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); if (put_page_testzero(page_head)) { /* * If this is the tail of a slab @@ -148,7 +148,7 @@ static void put_compound_page(struct page *page) * the compound page enters the buddy * allocator. */ - VM_BUG_ON(PageSlab(page_head)); + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); __put_compound_page(page_head); } return; @@ -199,7 +199,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON(page_head != page->first_page); + VM_BUG_ON_PAGE(page_head != page->first_page, page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -207,12 +207,12 @@ out_put_single: * compound_lock. */ if (put_page_testzero(page_head)) - VM_BUG_ON(1); + VM_BUG_ON_PAGE(1, page_head); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(page_mapcount(page) <= 0); + VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); atomic_dec(&page->_mapcount); - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); + VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { @@ -223,7 +223,7 @@ out_put_single: } } else { /* page_head is a dangling pointer */ - VM_BUG_ON(PageTail(page)); + VM_BUG_ON_PAGE(PageTail(page), page); goto out_put_single; } } @@ -264,7 +264,7 @@ bool __get_page_tail(struct page *page) * page. __split_huge_page_refcount * cannot race here. */ - VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); __get_page_tail_foll(page, true); return true; } else { @@ -604,8 +604,8 @@ EXPORT_SYMBOL(__lru_cache_add); */ void lru_cache_add(struct page *page) { - VM_BUG_ON(PageActive(page) && PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); __lru_cache_add(page); } @@ -846,7 +846,7 @@ void release_pages(struct page **pages, int nr, int cold) } lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); } @@ -888,9 +888,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, { const int file = 0; - VM_BUG_ON(!PageHead(page)); - VM_BUG_ON(PageCompound(page_tail)); - VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(PageCompound(page_tail), page); + VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); @@ -929,7 +929,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, int active = PageActive(page); enum lru_list lru = page_lru(page); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); diff --git a/mm/swap_state.c b/mm/swap_state.c index e6f15f8ca2af..98e85e9c2b2d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -83,9 +83,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) int error; struct address_space *address_space; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageSwapCache(page)); - VM_BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); page_cache_get(page); SetPageSwapCache(page); @@ -139,9 +139,9 @@ void __delete_from_swap_cache(struct page *page) swp_entry_t entry; struct address_space *address_space; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageSwapCache(page)); - VM_BUG_ON(PageWriteback(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(PageWriteback(page), page); entry.val = page_private(page); address_space = swap_address_space(entry); @@ -165,8 +165,8 @@ int add_to_swap(struct page *page, struct list_head *list) swp_entry_t entry; int err; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageUptodate(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); entry = get_swap_page(); if (!entry.val) diff --git a/mm/swapfile.c b/mm/swapfile.c index 612a7c9795f6..d443dea95c27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -906,7 +906,7 @@ int reuse_swap_page(struct page *page) { int count; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; count = page_mapcount(page); @@ -926,7 +926,7 @@ int reuse_swap_page(struct page *page) */ int try_to_free_swap(struct page *page) { - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (!PageSwapCache(page)) return 0; @@ -2714,7 +2714,7 @@ struct swap_info_struct *page_swap_info(struct page *page) */ struct address_space *__page_file_mapping(struct page *page) { - VM_BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); return page_swap_info(page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(__page_file_mapping); @@ -2722,7 +2722,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - VM_BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); diff --git a/mm/vmscan.c b/mm/vmscan.c index eea668d9cff6..2254f36b74b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -603,7 +603,7 @@ void putback_lru_page(struct page *page) bool is_unevictable; int was_unevictable = PageUnevictable(page); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); redo: ClearPageUnevictable(page); @@ -794,8 +794,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!trylock_page(page)) goto keep; - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -1079,14 +1079,14 @@ activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } free_hot_cold_page_list(&free_pages, 1); @@ -1240,7 +1240,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); switch (__isolate_lru_page(page, mode)) { case 0: @@ -1295,7 +1295,7 @@ int isolate_lru_page(struct page *page) { int ret = -EBUSY; - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); if (PageLRU(page)) { struct zone *zone = page_zone(page); @@ -1366,7 +1366,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) struct page *page = lru_to_page(page_list); int lru; - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1586,7 +1586,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, page = lru_to_page(list); lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); @@ -3701,7 +3701,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec, lru); -- cgit v1.2.3 From 3965fc3652244651006ebb31c8c45318ce84818f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:52:55 -0800 Subject: slab: clean up kmem_cache_create_memcg() error handling Currently kmem_cache_create_memcg() backoffs on failure inside conditionals, without using gotos. This results in the rollback code duplication, which makes the function look cumbersome even though on error we should only free the allocated cache. Since in the next patch I am going to add yet another rollback function call on error path there, let's employ labels instead of conditionals for undoing any changes on failure to keep things clean. Signed-off-by: Vladimir Davydov Reviewed-by: Pekka Enberg Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 65 +++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 0b7bb399b0e4..f70df3ef6f1a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -171,13 +171,14 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, struct kmem_cache *parent_cache) { struct kmem_cache *s = NULL; - int err = 0; + int err; get_online_cpus(); mutex_lock(&slab_mutex); - if (!kmem_cache_sanity_check(memcg, name, size) == 0) - goto out_locked; + err = kmem_cache_sanity_check(memcg, name, size); + if (err) + goto out_unlock; /* * Some allocators will constraint the set of valid flags to a subset @@ -189,45 +190,38 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); if (s) - goto out_locked; + goto out_unlock; + err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); - if (s) { - s->object_size = s->size = size; - s->align = calculate_alignment(flags, align, size); - s->ctor = ctor; + if (!s) + goto out_unlock; - if (memcg_register_cache(memcg, s, parent_cache)) { - kmem_cache_free(kmem_cache, s); - err = -ENOMEM; - goto out_locked; - } + s->object_size = s->size = size; + s->align = calculate_alignment(flags, align, size); + s->ctor = ctor; - s->name = kstrdup(name, GFP_KERNEL); - if (!s->name) { - kmem_cache_free(kmem_cache, s); - err = -ENOMEM; - goto out_locked; - } + s->name = kstrdup(name, GFP_KERNEL); + if (!s->name) + goto out_free_cache; - err = __kmem_cache_create(s, flags); - if (!err) { - s->refcount = 1; - list_add(&s->list, &slab_caches); - memcg_cache_list_add(memcg, s); - } else { - kfree(s->name); - kmem_cache_free(kmem_cache, s); - } - } else - err = -ENOMEM; + err = memcg_register_cache(memcg, s, parent_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); + memcg_cache_list_add(memcg, s); -out_locked: +out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); if (err) { - if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); @@ -236,11 +230,14 @@ out_locked: name, err); dump_stack(); } - return NULL; } - return s; + +out_free_cache: + kfree(s->name); + kmem_cache_free(kmem_cache, s); + goto out_unlock; } struct kmem_cache * -- cgit v1.2.3 From 363a044f739b0f07a8c063b838c5528d10720e02 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:52:56 -0800 Subject: memcg, slab: kmem_cache_create_memcg(): fix memleak on fail path We do not free the cache's memcg_params if __kmem_cache_create fails. Fix this. Plus, rename memcg_register_cache() to memcg_alloc_cache_params(), because it actually does not register the cache anywhere, but simply initialize kmem_cache::memcg_params. [akpm@linux-foundation.org: fix build] Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++++--- mm/slab_common.c | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 72f2d90e7ef6..b8ebe71f872d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3231,8 +3231,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, - struct kmem_cache *root_cache) +int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) { size_t size; @@ -3260,6 +3260,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, return 0; } +void memcg_free_cache_params(struct kmem_cache *s) +{ + kfree(s->memcg_params); +} + void memcg_release_cache(struct kmem_cache *s) { struct kmem_cache *root; @@ -3288,7 +3293,7 @@ void memcg_release_cache(struct kmem_cache *s) css_put(&memcg->css); out: - kfree(s->memcg_params); + memcg_free_cache_params(s); } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index f70df3ef6f1a..70f9e249ac30 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -205,7 +205,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, if (!s->name) goto out_free_cache; - err = memcg_register_cache(memcg, s, parent_cache); + err = memcg_alloc_cache_params(memcg, s, parent_cache); if (err) goto out_free_cache; @@ -235,6 +235,7 @@ out_unlock: return s; out_free_cache: + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); goto out_unlock; -- cgit v1.2.3 From 1aa13254259bdef0bca723849ab3bab308d2f0c3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:52:58 -0800 Subject: memcg, slab: clean up memcg cache initialization/destruction Currently, we have rather a messy function set relating to per-memcg kmem cache initialization/destruction. Per-memcg caches are created in memcg_create_kmem_cache(). This function calls kmem_cache_create_memcg() to allocate and initialize a kmem cache and then "registers" the new cache in the memcg_params::memcg_caches array of the parent cache. During its work-flow, kmem_cache_create_memcg() executes the following memcg-related functions: - memcg_alloc_cache_params(), to initialize memcg_params of the newly created cache; - memcg_cache_list_add(), to add the new cache to the memcg_slab_caches list. On the other hand, kmem_cache_destroy() called on a cache destruction only calls memcg_release_cache(), which does all the work: it cleans the reference to the cache in its parent's memcg_params::memcg_caches, removes the cache from the memcg_slab_caches list, and frees memcg_params. Such an inconsistency between destruction and initialization paths make the code difficult to read, so let's clean this up a bit. This patch moves all the code relating to registration of per-memcg caches (adding to memcg list, setting the pointer to a cache from its parent) to the newly created memcg_register_cache() and memcg_unregister_cache() functions making the initialization and destruction paths look symmetrical. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 64 ++++++++++++++++++++++++++------------------------------ mm/slab_common.c | 5 +++-- 2 files changed, 33 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b8ebe71f872d..739383cd3f70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3095,16 +3095,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) css_put(&memcg->css); } -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) -{ - if (!memcg) - return; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); -} - /* * helper for acessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -3265,21 +3255,41 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_release_cache(struct kmem_cache *s) +void memcg_register_cache(struct kmem_cache *s) { struct kmem_cache *root; struct mem_cgroup *memcg; int id; + if (is_root_cache(s)) + return; + + root = s->memcg_params->root_cache; + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + css_get(&memcg->css); + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); + + root->memcg_params->memcg_caches[id] = s; /* - * This happens, for instance, when a root cache goes away before we - * add any memcg. + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason */ - if (!s->memcg_params) - return; + wmb(); +} - if (s->memcg_params->is_root_cache) - goto out; +void memcg_unregister_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + if (is_root_cache(s)) + return; memcg = s->memcg_params->memcg; id = memcg_cache_id(memcg); @@ -3292,8 +3302,6 @@ void memcg_release_cache(struct kmem_cache *s) mutex_unlock(&memcg->slab_caches_mutex); css_put(&memcg->css); -out: - memcg_free_cache_params(s); } /* @@ -3451,26 +3459,13 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cache_from_memcg_idx(cachep, idx); - if (new_cachep) { - css_put(&memcg->css); + if (new_cachep) goto out; - } new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) { + if (new_cachep == NULL) new_cachep = cachep; - css_put(&memcg->css); - goto out; - } - - atomic_set(&new_cachep->memcg_params->nr_pages , 0); - cachep->memcg_params->memcg_caches[idx] = new_cachep; - /* - * the readers won't lock, make sure everybody sees the updated value, - * so they won't put stuff in the queue again for no reason - */ - wmb(); out: mutex_unlock(&memcg_cache_mutex); return new_cachep; @@ -3550,6 +3545,7 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); + css_put(&cw->memcg->css); kfree(cw); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 70f9e249ac30..db24ec48b946 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -215,7 +215,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_cache_list_add(memcg, s); + memcg_register_cache(s); out_unlock: mutex_unlock(&slab_mutex); @@ -265,7 +265,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); - memcg_release_cache(s); + memcg_unregister_cache(s); + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } else { -- cgit v1.2.3 From 959c8963fc6c8c9b97e80c55ce77105247040e7d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:52:59 -0800 Subject: memcg, slab: fix barrier usage when accessing memcg_caches Each root kmem_cache has pointers to per-memcg caches stored in its memcg_params::memcg_caches array. Whenever we want to allocate a slab for a memcg, we access this array to get per-memcg cache to allocate from (see memcg_kmem_get_cache()). The access must be lock-free for performance reasons, so we should use barriers to assert the kmem_cache is up-to-date. First, we should place a write barrier immediately before setting the pointer to it in the memcg_caches array in order to make sure nobody will see a partially initialized object. Second, we should issue a read barrier before dereferencing the pointer to conform to the write barrier. However, currently the barrier usage looks rather strange. We have a write barrier *after* setting the pointer and a read barrier *before* reading the pointer, which is incorrect. This patch fixes this. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 ++++++++++-------------- mm/slab.h | 12 +++++++++++- 2 files changed, 21 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 739383cd3f70..322d18dc17f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3274,12 +3274,14 @@ void memcg_register_cache(struct kmem_cache *s) list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); mutex_unlock(&memcg->slab_caches_mutex); - root->memcg_params->memcg_caches[id] = s; /* - * the readers won't lock, make sure everybody sees the updated value, - * so they won't put stuff in the queue again for no reason + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. */ - wmb(); + smp_wmb(); + + root->memcg_params->memcg_caches[id] = s; } void memcg_unregister_cache(struct kmem_cache *s) @@ -3605,7 +3607,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; - int idx; + struct kmem_cache *memcg_cachep; VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); @@ -3619,15 +3621,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, if (!memcg_can_account_kmem(memcg)) goto out; - idx = memcg_cache_id(memcg); - - /* - * barrier to mare sure we're always seeing the up to date value. The - * code updating memcg_caches will issue a write barrier to match this. - */ - read_barrier_depends(); - if (likely(cache_from_memcg_idx(cachep, idx))) { - cachep = cache_from_memcg_idx(cachep, idx); + memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + if (likely(memcg_cachep)) { + cachep = memcg_cachep; goto out; } diff --git a/mm/slab.h b/mm/slab.h index 0859c4241ba1..72d1f9df71bd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,9 +163,19 @@ static inline const char *cache_name(struct kmem_cache *s) static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { + struct kmem_cache *cachep; + if (!s->memcg_params) return NULL; - return s->memcg_params->memcg_caches[idx]; + cachep = s->memcg_params->memcg_caches[idx]; + + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match this (see + * memcg_register_cache()). + */ + smp_read_barrier_depends(); + return cachep; } static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) -- cgit v1.2.3 From 96403da244443d9842dbf290c2a02390b78a158e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:01 -0800 Subject: memcg: fix possible NULL deref while traversing memcg_slab_caches list All caches of the same memory cgroup are linked in the memcg_slab_caches list via kmem_cache::memcg_params::list. This list is traversed, for example, when we read memory.kmem.slabinfo. Since the list actually consists of memcg_cache_params objects, we have to convert an element of the list to a kmem_cache object using memcg_params_to_cache(), which obtains the pointer to the cache from the memcg_params::memcg_caches array of the corresponding root cache. That said the pointer to a kmem_cache in its parent's memcg_params must be initialized before adding the cache to the list, and cleared only after it has been unlinked. Currently it is vice-versa, which can result in a NULL ptr dereference while traversing the memcg_slab_caches list. This patch restores the correct order. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 322d18dc17f0..014a4f1acf1c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3270,9 +3270,6 @@ void memcg_register_cache(struct kmem_cache *s) css_get(&memcg->css); - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3281,7 +3278,16 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); + /* + * Initialize the pointer to this cache in its parent's memcg_params + * before adding it to the memcg_slab_caches list, otherwise we can + * fail to convert memcg_params_to_cache() while traversing the list. + */ root->memcg_params->memcg_caches[id] = s; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); } void memcg_unregister_cache(struct kmem_cache *s) @@ -3293,16 +3299,21 @@ void memcg_unregister_cache(struct kmem_cache *s) if (is_root_cache(s)) return; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); - root = s->memcg_params->root_cache; - root->memcg_params->memcg_caches[id] = NULL; + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); mutex_lock(&memcg->slab_caches_mutex); list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); + /* + * Clear the pointer to this cache in its parent's memcg_params only + * after removing it from the memcg_slab_caches list, otherwise we can + * fail to convert memcg_params_to_cache() while traversing the list. + */ + root->memcg_params->memcg_caches[id] = NULL; + css_put(&memcg->css); } -- cgit v1.2.3 From 2edefe1155b3ad3dc92065f6e1018d363525296e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:02 -0800 Subject: memcg, slab: fix races in per-memcg cache creation/destruction We obtain a per-memcg cache from a root kmem_cache by dereferencing an entry of the root cache's memcg_params::memcg_caches array. If we find no cache for a memcg there on allocation, we initiate the memcg cache creation (see memcg_kmem_get_cache()). The cache creation proceeds asynchronously in memcg_create_kmem_cache() in order to avoid lock clashes, so there can be several threads trying to create the same kmem_cache concurrently, but only one of them may succeed. However, due to a race in the code, it is not always true. The point is that the memcg_caches array can be relocated when we activate kmem accounting for a memcg (see memcg_update_all_caches(), memcg_update_cache_size()). If memcg_update_cache_size() and memcg_create_kmem_cache() proceed concurrently as described below, we can leak a kmem_cache. Asume two threads schedule creation of the same kmem_cache. One of them successfully creates it. Another one should fail then, but if memcg_create_kmem_cache() interleaves with memcg_update_cache_size() as follows, it won't: memcg_create_kmem_cache() memcg_update_cache_size() (called w/o mutexes held) (called with slab_mutex, set_limit_mutex held) ------------------------- ------------------------- mutex_lock(&memcg_cache_mutex) s->memcg_params=kzalloc(...) new_cachep=cache_from_memcg_idx(cachep,idx) // new_cachep==NULL => proceed to creation s->memcg_params->memcg_caches[i] =cur_params->memcg_caches[i] // kmem_cache_create_memcg takes slab_mutex // so we will hang around until // memcg_update_cache_size finishes, but // nothing will prevent it from succeeding so // memcg_caches[idx] will be overwritten in // memcg_register_cache! new_cachep = kmem_cache_create_memcg(...) mutex_unlock(&memcg_cache_mutex) Let's fix this by moving the check for existence of the memcg cache to kmem_cache_create_memcg() to be called under the slab_mutex and make it return NULL if so. A similar race is possible when destroying a memcg cache (see kmem_cache_destroy()). Since memcg_unregister_cache(), which clears the pointer in the memcg_caches array, is called w/o protection, we can race with memcg_update_cache_size() and omit clearing the pointer. Therefore memcg_unregister_cache() should be moved before we release the slab_mutex. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 ++++++++++++++--------- mm/slab_common.c | 14 +++++++++++++- 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 014a4f1acf1c..d2da65c4cd84 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3264,6 +3264,12 @@ void memcg_register_cache(struct kmem_cache *s) if (is_root_cache(s)) return; + /* + * Holding the slab_mutex assures nobody will touch the memcg_caches + * array while we are modifying it. + */ + lockdep_assert_held(&slab_mutex); + root = s->memcg_params->root_cache; memcg = s->memcg_params->memcg; id = memcg_cache_id(memcg); @@ -3283,6 +3289,7 @@ void memcg_register_cache(struct kmem_cache *s) * before adding it to the memcg_slab_caches list, otherwise we can * fail to convert memcg_params_to_cache() while traversing the list. */ + VM_BUG_ON(root->memcg_params->memcg_caches[id]); root->memcg_params->memcg_caches[id] = s; mutex_lock(&memcg->slab_caches_mutex); @@ -3299,6 +3306,12 @@ void memcg_unregister_cache(struct kmem_cache *s) if (is_root_cache(s)) return; + /* + * Holding the slab_mutex assures nobody will touch the memcg_caches + * array while we are modifying it. + */ + lockdep_assert_held(&slab_mutex); + root = s->memcg_params->root_cache; memcg = s->memcg_params->memcg; id = memcg_cache_id(memcg); @@ -3312,6 +3325,7 @@ void memcg_unregister_cache(struct kmem_cache *s) * after removing it from the memcg_slab_caches list, otherwise we can * fail to convert memcg_params_to_cache() while traversing the list. */ + VM_BUG_ON(!root->memcg_params->memcg_caches[id]); root->memcg_params->memcg_caches[id] = NULL; css_put(&memcg->css); @@ -3464,22 +3478,13 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct kmem_cache *new_cachep; - int idx; BUG_ON(!memcg_can_account_kmem(memcg)); - idx = memcg_cache_id(memcg); - mutex_lock(&memcg_cache_mutex); - new_cachep = cache_from_memcg_idx(cachep, idx); - if (new_cachep) - goto out; - new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) new_cachep = cachep; - -out: mutex_unlock(&memcg_cache_mutex); return new_cachep; } diff --git a/mm/slab_common.c b/mm/slab_common.c index db24ec48b946..f34707eeacc7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -180,6 +180,18 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, if (err) goto out_unlock; + if (memcg) { + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can + * try to create the same cache, but only one of them may + * succeed. Therefore if we get here and see the cache has + * already been created, we silently return NULL. + */ + if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -261,11 +273,11 @@ void kmem_cache_destroy(struct kmem_cache *s) list_del(&s->list); if (!__kmem_cache_shutdown(s)) { + memcg_unregister_cache(s); mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); - memcg_unregister_cache(s); memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); -- cgit v1.2.3 From 842e2873697e023d140a8905a41fcf39d4e546f1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:03 -0800 Subject: memcg: get rid of kmem_cache_dup() kmem_cache_dup() is only called from memcg_create_kmem_cache(). The latter, in fact, does nothing besides this, so let's fold kmem_cache_dup() into memcg_create_kmem_cache(). This patch also makes the memcg_cache_mutex private to memcg_create_kmem_cache(), because it is not used anywhere else. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 39 ++++++++------------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d2da65c4cd84..80197e544764 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3427,27 +3427,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); - -/* - * Called with memcg_cache_mutex held - */ -static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, - struct kmem_cache *s) +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *s) { struct kmem_cache *new; static char *tmp_name = NULL; + static DEFINE_MUTEX(mutex); /* protects tmp_name */ - lockdep_assert_held(&memcg_cache_mutex); + BUG_ON(!memcg_can_account_kmem(memcg)); + mutex_lock(&mutex); /* * kmem_cache_create_memcg duplicates the given name and * cgroup_name for this name requires RCU context. @@ -3470,25 +3459,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, if (new) new->allocflags |= __GFP_KMEMCG; + else + new = s; + mutex_unlock(&mutex); return new; } -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - struct kmem_cache *new_cachep; - - BUG_ON(!memcg_can_account_kmem(memcg)); - - mutex_lock(&memcg_cache_mutex); - new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) - new_cachep = cachep; - mutex_unlock(&memcg_cache_mutex); - return new_cachep; -} - void kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; -- cgit v1.2.3 From f717eb3abb5ea38f60e671dbfdbf512c2c93d22e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:05 -0800 Subject: slab: do not panic if we fail to create memcg cache There is no point in flooding logs with warnings or especially crashing the system if we fail to create a cache for a memcg. In this case we will be accounting the memcg allocation to the root cgroup until we succeed to create its own cache, but it isn't that critical. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index f34707eeacc7..8e40321da091 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -233,7 +233,14 @@ out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); - if (err) { + /* + * There is no point in flooding logs with warnings or especially + * crashing the system if we fail to create a cache for a memcg. In + * this case we will be accounting the memcg allocation to the root + * cgroup until we succeed to create its own cache, but it isn't that + * critical. + */ + if (err && !memcg) { if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); -- cgit v1.2.3 From f8570263ee16eb1d5038b8e20d7db3a68bbb2b49 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:06 -0800 Subject: memcg, slab: RCU protect memcg_params for root caches We relocate root cache's memcg_params whenever we need to grow the memcg_caches array to accommodate all kmem-active memory cgroups. Currently on relocation we free the old version immediately, which can lead to use-after-free, because the memcg_caches array is accessed lock-free (see cache_from_memcg_idx()). This patch fixes this by making memcg_params RCU-protected for root caches. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++------- mm/slab.h | 16 +++++++++++++++- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80197e544764..216659d4441a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3178,18 +3178,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) if (num_groups > memcg_limited_groups_array_size) { int i; + struct memcg_cache_params *new_params; ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); size += offsetof(struct memcg_cache_params, memcg_caches); - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) { - s->memcg_params = cur_params; + new_params = kzalloc(size, GFP_KERNEL); + if (!new_params) return -ENOMEM; - } - s->memcg_params->is_root_cache = true; + new_params->is_root_cache = true; /* * There is the chance it will be bigger than @@ -3203,7 +3202,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) for (i = 0; i < memcg_limited_groups_array_size; i++) { if (!cur_params->memcg_caches[i]) continue; - s->memcg_params->memcg_caches[i] = + new_params->memcg_caches[i] = cur_params->memcg_caches[i]; } @@ -3216,7 +3215,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) * bigger than the others. And all updates will reset this * anyway. */ - kfree(cur_params); + rcu_assign_pointer(s->memcg_params, new_params); + if (cur_params) + kfree_rcu(cur_params, rcu_head); } return 0; } diff --git a/mm/slab.h b/mm/slab.h index 72d1f9df71bd..8184a7cde272 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -160,14 +160,28 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } +/* + * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. + * That said the caller must assure the memcg's cache won't go away. Since once + * created a memcg's cache is destroyed only along with the root cache, it is + * true if we are going to allocate from the cache or hold a reference to the + * root cache by other means. Otherwise, we should hold either the slab_mutex + * or the memcg's slab_caches_mutex while calling this function and accessing + * the returned value. + */ static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { struct kmem_cache *cachep; + struct memcg_cache_params *params; if (!s->memcg_params) return NULL; - cachep = s->memcg_params->memcg_caches[idx]; + + rcu_read_lock(); + params = rcu_dereference(s->memcg_params); + cachep = params->memcg_caches[idx]; + rcu_read_unlock(); /* * Make sure we will access the up-to-date value. The code updating -- cgit v1.2.3 From 6de64beb3435ab8f2ac1428dd7dddad5dc679c4b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:08 -0800 Subject: memcg: remove KMEM_ACCOUNTED_ACTIVATED flag Currently we have two state bits in mem_cgroup::kmem_account_flags regarding kmem accounting activation, ACTIVATED and ACTIVE. We start kmem accounting only if both flags are set (memcg_can_account_kmem()), plus throughout the code there are several places where we check only the ACTIVE flag, but we never check the ACTIVATED flag alone. These flags are both set from memcg_update_kmem_limit() under the set_limit_mutex, the ACTIVE flag always being set after ACTIVATED, and they never get cleared. That said checking if both flags are set is equivalent to checking only for the ACTIVE flag, and since there is no ACTIVATED flag checks, we can safely remove the ACTIVATED flag, and nothing will change. Let's try to understand what was the reason for introducing these flags. The purpose of the ACTIVE flag is clear - it states that kmem should be accounting to the cgroup. The only requirement for it is that it should be set after we have fully initialized kmem accounting bits for the cgroup and patched all static branches relating to kmem accounting. Since we always check if static branch is enabled before actually considering if we should account (otherwise we wouldn't benefit from static branching), this guarantees us that we won't skip a commit or uncharge after a charge due to an unpatched static branch. Now let's move on to the ACTIVATED bit. As I proved in the beginning of this message, it is absolutely useless, and removing it will change nothing. So what was the reason introducing it? The ACTIVATED flag was introduced by commit a8964b9b84f9 ("memcg: use static branches when code not in use") in order to guarantee that static_key_slow_inc(&memcg_kmem_enabled_key) would be called only once for each memory cgroup when its kmem accounting was activated. The point was that at that time the memcg_update_kmem_limit() function's work-flow looked like this: bool must_inc_static_branch = false; cgroup_lock(); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { /* The kmem limit is set for the first time */ ret = res_counter_set_limit(&memcg->kmem, val); memcg_kmem_set_activated(memcg); must_inc_static_branch = true; } else ret = res_counter_set_limit(&memcg->kmem, val); mutex_unlock(&set_limit_mutex); cgroup_unlock(); if (must_inc_static_branch) { /* We can't do this under cgroup_lock */ static_key_slow_inc(&memcg_kmem_enabled_key); memcg_kmem_set_active(memcg); } So that without the ACTIVATED flag we could race with other threads trying to set the limit and increment the static branching ref-counter more than once. Today we call the whole memcg_update_kmem_limit() function under the set_limit_mutex and this race is impossible. As now we understand why the ACTIVATED bit was introduced and why we don't need it now, and know that removing it will change nothing anyway, let's get rid of it. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 216659d4441a..706f7bc16db2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -382,15 +382,10 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { - KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ - KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ + KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; -/* We account when limit is on, but only after call sites are patched */ -#define KMEM_ACCOUNTED_MASK \ - ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) - #ifdef CONFIG_MEMCG_KMEM static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) { @@ -402,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } -static void memcg_kmem_set_activated(struct mem_cgroup *memcg) -{ - set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} - -static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) -{ - clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} - static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { /* @@ -2995,8 +2980,7 @@ static DEFINE_MUTEX(set_limit_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == - KMEM_ACCOUNTED_MASK; + memcg_kmem_is_active(memcg); } /* @@ -3120,19 +3104,10 @@ static int memcg_update_cache_sizes(struct mem_cgroup *memcg) 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); if (num < 0) return num; - /* - * After this point, kmem_accounted (that we test atomically in - * the beginning of this conditional), is no longer 0. This - * guarantees only one process will set the following boolean - * to true. We don't need test_and_set because we're protected - * by the set_limit_mutex anyway. - */ - memcg_kmem_set_activated(memcg); ret = memcg_update_all_caches(num+1); if (ret) { ida_simple_remove(&kmem_limited_groups, num); - memcg_kmem_clear_activated(memcg); return ret; } -- cgit v1.2.3 From d6441637709ba302905f1076f2afcb6d4ea3a901 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:09 -0800 Subject: memcg: rework memcg_update_kmem_limit synchronization Currently we take both the memcg_create_mutex and the set_limit_mutex when we enable kmem accounting for a memory cgroup, which makes kmem activation events serialize with both memcg creations and other memcg limit updates (memory.limit, memory.memsw.limit). However, there is no point in such strict synchronization rules there. First, the set_limit_mutex was introduced to keep the memory.limit and memory.memsw.limit values in sync. Since memory.kmem.limit can be set independently of them, it is better to introduce a separate mutex to synchronize against concurrent kmem limit updates. Second, we take the memcg_create_mutex in order to make sure all children of this memcg will be kmem-active as well. For achieving that, it is enough to hold this mutex only while checking if memcg_has_children() though. This guarantees that if a child is added after we checked that the memcg has no children, the newly added cgroup will see its parent kmem-active (of course if the latter succeeded), and call kmem activation for itself. This patch simplifies the locking rules of memcg_update_kmem_limit() according to these considerations. [vdavydov@parallels.com: fix unintialized var warning] Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 196 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 91 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 706f7bc16db2..c8715056e1ef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2977,6 +2977,8 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +static DEFINE_MUTEX(activate_kmem_mutex); + static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && @@ -3089,34 +3091,6 @@ int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/* - * This ends up being protected by the set_limit mutex, during normal - * operation, because that is its main call site. - * - * But when we create a new cache, we can call this as well if its parent - * is kmem-limited. That will have to hold set_limit_mutex as well. - */ -static int memcg_update_cache_sizes(struct mem_cgroup *memcg) -{ - int num, ret; - - num = ida_simple_get(&kmem_limited_groups, - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); - if (num < 0) - return num; - - ret = memcg_update_all_caches(num+1); - if (ret) { - ida_simple_remove(&kmem_limited_groups, num); - return ret; - } - - memcg->kmemcg_id = num; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); - return 0; -} - static size_t memcg_caches_array_size(int num_groups) { ssize_t size; @@ -3459,9 +3433,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * * Still, we don't want anyone else freeing memcg_caches under our * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the set_limit_mutex to protect ourselves against this. + * we'll take the activate_kmem_mutex to protect ourselves against + * this. */ - mutex_lock(&set_limit_mutex); + mutex_lock(&activate_kmem_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) @@ -3484,7 +3459,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); } - mutex_unlock(&set_limit_mutex); + mutex_unlock(&activate_kmem_mutex); } struct create_work { @@ -5148,11 +5123,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return val; } -static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) -{ - int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg = mem_cgroup_from_css(css); +/* should be called with activate_kmem_mutex held */ +static int __memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long long limit) +{ + int err = 0; + int memcg_id; + + if (memcg_kmem_is_active(memcg)) + return 0; + + /* + * We are going to allocate memory for data shared by all memory + * cgroups so let's stop accounting here. + */ + memcg_stop_kmem_account(); + /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -5166,72 +5153,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) * of course permitted. */ mutex_lock(&memcg_create_mutex); - mutex_lock(&set_limit_mutex); - if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { - if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { - ret = -EBUSY; - goto out; - } - ret = res_counter_set_limit(&memcg->kmem, val); - VM_BUG_ON(ret); + if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + err = -EBUSY; + mutex_unlock(&memcg_create_mutex); + if (err) + goto out; - ret = memcg_update_cache_sizes(memcg); - if (ret) { - res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); - goto out; - } - static_key_slow_inc(&memcg_kmem_enabled_key); - /* - * setting the active bit after the inc will guarantee no one - * starts accounting before all call sites are patched - */ - memcg_kmem_set_active(memcg); - } else - ret = res_counter_set_limit(&memcg->kmem, val); + memcg_id = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (memcg_id < 0) { + err = memcg_id; + goto out; + } + + /* + * Make sure we have enough space for this cgroup in each root cache's + * memcg_params. + */ + err = memcg_update_all_caches(memcg_id + 1); + if (err) + goto out_rmid; + + memcg->kmemcg_id = memcg_id; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + + /* + * We couldn't have accounted to this cgroup, because it hasn't got the + * active bit set yet, so this should succeed. + */ + err = res_counter_set_limit(&memcg->kmem, limit); + VM_BUG_ON(err); + + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * Setting the active bit after enabling static branching will + * guarantee no one starts accounting before all call sites are + * patched. + */ + memcg_kmem_set_active(memcg); out: - mutex_unlock(&set_limit_mutex); - mutex_unlock(&memcg_create_mutex); -#endif + memcg_resume_kmem_account(); + return err; + +out_rmid: + ida_simple_remove(&kmem_limited_groups, memcg_id); + goto out; +} + +static int memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long long limit) +{ + int ret; + + mutex_lock(&activate_kmem_mutex); + ret = __memcg_activate_kmem(memcg, limit); + mutex_unlock(&activate_kmem_mutex); + return ret; +} + +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + int ret; + + if (!memcg_kmem_is_active(memcg)) + ret = memcg_activate_kmem(memcg, val); + else + ret = res_counter_set_limit(&memcg->kmem, val); return ret; } -#ifdef CONFIG_MEMCG_KMEM static int memcg_propagate_kmem(struct mem_cgroup *memcg) { int ret = 0; struct mem_cgroup *parent = parent_mem_cgroup(memcg); - if (!parent) - goto out; - memcg->kmem_account_flags = parent->kmem_account_flags; - /* - * When that happen, we need to disable the static branch only on those - * memcgs that enabled it. To achieve this, we would be forced to - * complicate the code by keeping track of which memcgs were the ones - * that actually enabled limits, and which ones got it from its - * parents. - * - * It is a lot simpler just to do static_key_slow_inc() on every child - * that is accounted. - */ - if (!memcg_kmem_is_active(memcg)) - goto out; + if (!parent) + return 0; + mutex_lock(&activate_kmem_mutex); /* - * __mem_cgroup_free() will issue static_key_slow_dec() because this - * memcg is active already. If the later initialization fails then the - * cgroup core triggers the cleanup so we do not have to do it here. + * If the parent cgroup is not kmem-active now, it cannot be activated + * after this point, because it has at least one child already. */ - static_key_slow_inc(&memcg_kmem_enabled_key); - - mutex_lock(&set_limit_mutex); - memcg_stop_kmem_account(); - ret = memcg_update_cache_sizes(memcg); - memcg_resume_kmem_account(); - mutex_unlock(&set_limit_mutex); -out: + if (memcg_kmem_is_active(parent)) + ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); + mutex_unlock(&activate_kmem_mutex); return ret; } +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + return -EINVAL; +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -5265,7 +5281,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); else if (type == _KMEM) - ret = memcg_update_kmem_limit(css, val); + ret = memcg_update_kmem_limit(memcg, val); else return -EINVAL; break; @@ -6499,7 +6515,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); - int error = 0; if (css->cgroup->id > MEM_CGROUP_ID_MAX) return -ENOSPC; @@ -6534,10 +6549,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) mem_cgroup_subsys.broken_hierarchy = true; } - - error = memcg_init_kmem(memcg, &mem_cgroup_subsys); mutex_unlock(&memcg_create_mutex); - return error; + + return memcg_init_kmem(memcg, &mem_cgroup_subsys); } /* -- cgit v1.2.3 From 87379ec8c2b8ae0acd526b87d2240afca92a7505 Mon Sep 17 00:00:00 2001 From: Philipp Hachtmann Date: Thu, 23 Jan 2014 15:53:10 -0800 Subject: mm/nobootmem.c: add return value check in __alloc_memory_core_early() When memblock_reserve() fails because memblock.reserved.regions cannot be resized, the caller (e.g. alloc_bootmem()) is not informed of the failed allocation. Therefore alloc_bootmem() silently returns the same pointer again and again. This patch adds a check for the return value of memblock_reserve() in __alloc_memory_core(). Signed-off-by: Philipp Hachtmann Reviewed-by: Tejun Heo Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tang Chen Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 19121ceb8874..bb1a70cc97a7 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -45,7 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (!addr) return NULL; - memblock_reserve(addr, size); + if (memblock_reserve(addr, size)) + return NULL; + ptr = phys_to_virt(addr); memset(ptr, 0, size); /* -- cgit v1.2.3 From 5e270e254885893f8c82ab9b91caa648af3690df Mon Sep 17 00:00:00 2001 From: Philipp Hachtmann Date: Thu, 23 Jan 2014 15:53:11 -0800 Subject: mm: free memblock.memory in free_all_bootmem When calling free_all_bootmem() the free areas under memblock's control are released to the buddy allocator. Additionally the reserved list is freed if it was reallocated by memblock. The same should apply for the memory list. Signed-off-by: Philipp Hachtmann Reviewed-by: Tejun Heo Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tang Chen Cc: Toshi Kani Cc: Jianguo Wu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 16 ++++++++++++++++ mm/nobootmem.c | 10 +++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 1c2ef2c7edab..64ed2439cf75 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -291,6 +291,22 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( memblock.reserved.max); } +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + +phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( + phys_addr_t *addr) +{ + if (memblock.memory.regions == memblock_memory_init_regions) + return 0; + + *addr = __pa(memblock.memory.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); +} + +#endif + /** * memblock_double_array - double the size of the memblock regions array * @type: memblock type of the regions array being doubled diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bb1a70cc97a7..17c89023184f 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -122,11 +122,19 @@ static unsigned long __init free_low_memory_core_early(void) for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); - /* free range that is used for reserved array if we allocate it */ + /* Free memblock.reserved array if it was allocated */ size = get_allocated_memblock_reserved_regions_info(&start); if (size) count += __free_memory_core(start, start + size); +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + + /* Free memblock.memory array if it was allocated */ + size = get_allocated_memblock_memory_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); +#endif + return count; } -- cgit v1.2.3 From 54a43d54988a3731d644fdeb7a1d6f46b4ac64c7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 23 Jan 2014 15:53:13 -0800 Subject: numa: add a sysctl for numa_balancing Add a working sysctl to enable/disable automatic numa memory balancing at runtime. This allows us to track down performance problems with this feature and is generally a good idea. This was possible earlier through debugfs, but only with special debugging options set. Also fix the boot message. [akpm@linux-foundation.org: s/sched_numa_balancing/sysctl_numa_balancing/] Signed-off-by: Andi Kleen Acked-by: Mel Gorman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d4e270..947293e76533 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2668,7 +2668,7 @@ static void __init check_numabalancing_enable(void) if (nr_node_ids > 1 && !numabalancing_override) { printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or sysctl"); + "Configure with numa_balancing= or the kernel.numa_balancing sysctl"); set_numabalancing_state(numabalancing_default); } } -- cgit v1.2.3 From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 23 Jan 2014 15:53:14 -0800 Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split After thp split in hwpoison_user_mappings(), we hold page lock on the raw error page only between try_to_unmap, hence we are in danger of race condition. I found in the RHEL7 MCE-relay testing that we have "bad page" error when a memory error happens on a thp tail page used by qemu-kvm: Triggering MCE exception on CPU 10 mce: [Hardware Error]: Machine check events logged MCE exception done on CPU 10 MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption MCE 0x38c535: dirty LRU page recovery: Recovered qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000] BUG: Bad page state in process qemu-kvm pfn:38c400 page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00 page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked) Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ... CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1 Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011 Call Trace: dump_stack+0x19/0x1b bad_page.part.59+0xcf/0xe8 free_pages_prepare+0x148/0x160 free_hot_cold_page+0x31/0x140 free_hot_cold_page_list+0x46/0xa0 release_pages+0x1c1/0x200 free_pages_and_swap_cache+0xad/0xd0 tlb_flush_mmu.part.46+0x4c/0x90 tlb_finish_mmu+0x55/0x60 exit_mmap+0xcb/0x170 mmput+0x67/0xf0 vhost_dev_cleanup+0x231/0x260 [vhost_net] vhost_net_release+0x3f/0x90 [vhost_net] __fput+0xe9/0x270 ____fput+0xe/0x10 task_work_run+0xc4/0xe0 do_exit+0x2bb/0xa40 do_group_exit+0x3f/0xa0 get_signal_to_deliver+0x1d0/0x6e0 do_signal+0x48/0x5e0 do_notify_resume+0x71/0xc0 retint_signal+0x48/0x8c The reason of this bug is that a page fault happens before unlocking the head page at the end of memory_failure(). This strange page fault is trying to access to address 0x20 and I'm not sure why qemu-kvm does this, but anyway as a result the SIGSEGV makes qemu-kvm exit and on the way we catch the bad page bug/warning because we try to free a locked page (which was the former head page.) To fix this, this patch suggests to shift page lock from head page to tail page just after thp split. SIGSEGV still happens, but it affects only error affected VMs, not a whole system. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Cc: [3.9+] # a3e0f9e47d5ef "mm/memory-failure.c: transfer page count from head page to tail page after split thp" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b25ed321e667..4f08a2d61487 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -856,14 +856,14 @@ static int page_action(struct page_state *ps, struct page *p, * the pages and send SIGBUS to the processes if the data was dirty. */ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno, int flags) + int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; int kill = 1, forcekill; - struct page *hpage = compound_head(p); + struct page *hpage = *hpagep; struct page *ppage; if (PageReserved(p) || PageSlab(p)) @@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * We pinned the head page for hwpoison handling, * now we split the thp and we are interested in * the hwpoisoned raw page, so move the refcount - * to it. + * to it. Similarly, page lock is shifted. */ if (hpage != p) { put_page(hpage); get_page(p); + lock_page(p); + unlock_page(hpage); + *hpagep = p; } /* THP is split, so ppage should be the real poisoned page. */ ppage = p; @@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(ppage, &tokill); - if (hpage != ppage) - lock_page(ppage); - ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(ppage)); - if (hpage != ppage) - unlock_page(ppage); - /* * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if @@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. + * + * When the raw error page is thp tail page, hpage points to the raw + * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { + if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) + != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); res = -EBUSY; goto out; -- cgit v1.2.3 From cc81717ed3bc6d4f3738d13a1e097437caada0e9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 23 Jan 2014 15:53:15 -0800 Subject: mm: new_vma_page() cannot see NULL vma for hugetlb pages Commit 11c731e81bb0 ("mm/mempolicy: fix !vma in new_vma_page()") has removed BUG_ON(!vma) from new_vma_page which is partially correct because page_address_in_vma will return EFAULT for non-linear mappings and at least shared shmem might be mapped this way. The patch also tried to prevent NULL ptr for hugetlb pages which is not correct AFAICS because hugetlb pages cannot be mapped as VM_NONLINEAR and other conditions in page_address_in_vma seem to be legit and catch real bugs. This patch restores BUG_ON for PageHuge to catch potential issues when the to-be-migrated page is not setup properly. Signed-off-by: Michal Hocko Reviewed-by: Bob Liu Cc: Sasha Levin Cc: Wanpeng Li Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 947293e76533..463b7fbf0d1d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1199,10 +1199,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * } if (PageHuge(page)) { - if (vma) - return alloc_huge_page_noerr(vma, address, 1); - else - return NULL; + BUG_ON(!vma); + return alloc_huge_page_noerr(vma, address, 1); } /* * if !vma, alloc_page_vma() will use task or system default policy -- cgit v1.2.3 From da8c757b080ee84f219fa2368cb5dd23ac304fc0 Mon Sep 17 00:00:00 2001 From: Han Pingtian Date: Thu, 23 Jan 2014 15:53:17 -0800 Subject: mm: prevent setting of a value less than 0 to min_free_kbytes If echo -1 > /proc/vm/sys/min_free_kbytes, the system will hang. Changing proc_dointvec() to proc_dointvec_minmax() in the min_free_kbytes_sysctl_handler() can prevent this to happen. mhocko said: : You can still do echo $BIG_VALUE > /proc/vm/sys/min_free_kbytes and make : your machine unusable but I agree that proc_dointvec_minmax is more : suitable here as we already have: : : .proc_handler = min_free_kbytes_sysctl_handler, : .extra1 = &zero, : : It used to work properly but then 6fce56ec91b5 ("sysctl: Remove references : to ctl_name and strategy from the generic sysctl table") has removed : sysctl_intvec strategy and so extra1 is ignored. Signed-off-by: Han Pingtian Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f18f016cca80..a818d568ddf3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5754,7 +5754,12 @@ module_init(init_per_zone_wmark_min) int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + if (write) { user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); -- cgit v1.2.3 From c980e66a556659f14da2294e1fc696e1352b5d00 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 23 Jan 2014 15:53:19 -0800 Subject: mm: do_mincore() cleanup Two cleanups: 1. remove redundant codes for hugetlb pages. 2. end = pmd_addr_end(addr, end) restricts [addr, end) within PMD_SIZE, this may increase do_mincore() calls, remove it. Signed-off-by: Jianguo Wu Acked-by: Johannes Weiner Cc: Minchan Kim Cc: qiuxishi Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mincore.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/mincore.c b/mm/mincore.c index da2be56a7b8f..101623378fbf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -225,13 +225,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); - if (is_vm_hugetlb_page(vma)) { - mincore_hugetlb_page_range(vma, addr, end, vec); - return (end - addr) >> PAGE_SHIFT; - } - - end = pmd_addr_end(addr, end); - if (is_vm_hugetlb_page(vma)) mincore_hugetlb_page_range(vma, addr, end, vec); else -- cgit v1.2.3 From baae911b27b8dbee6830f4e3ef0fcf4dc8e9c07b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 23 Jan 2014 15:53:21 -0800 Subject: sched/numa: fix setting of cpupid on page migration twice Commit 7851a45cd3f6 ("mm: numa: Copy cpupid on page migration") copiess over the cpupid at page migration time. It is unnecessary to set it again in migrate_misplaced_transhuge_page(). Signed-off-by: Wanpeng Li Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 4b3996eb7f0f..734704f6f29b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1753,8 +1753,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_cpupid_xchg_last(new_page, page_cpupid_last(page)); - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { put_page(new_page); -- cgit v1.2.3 From 0b1fb40a3b1291f2f12f13f644ac95cf756a00e6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:22 -0800 Subject: mm: vmscan: shrink all slab objects if tight on memory When reclaiming kmem, we currently don't scan slabs that have less than batch_size objects (see shrink_slab_node()): while (total_scan >= batch_size) { shrinkctl->nr_to_scan = batch_size; shrinker->scan_objects(shrinker, shrinkctl); total_scan -= batch_size; } If there are only a few shrinkers available, such a behavior won't cause any problems, because the batch_size is usually small, but if we have a lot of slab shrinkers, which is perfectly possible since FS shrinkers are now per-superblock, we can end up with hundreds of megabytes of practically unreclaimable kmem objects. For instance, mounting a thousand of ext2 FS images with a hundred of files in each and iterating over all the files using du(1) will result in about 200 Mb of FS caches that cannot be dropped even with the aid of the vm.drop_caches sysctl! This problem was initially pointed out by Glauber Costa [*]. Glauber proposed to fix it by making the shrink_slab() always take at least one pass, to put it simply, turning the scan loop above to a do{}while() loop. However, this proposal was rejected, because it could result in more aggressive and frequent slab shrinking even under low memory pressure when total_scan is naturally very small. This patch is a slightly modified version of Glauber's approach. Similarly to Glauber's patch, it makes shrink_slab() scan less than batch_size objects, but only if the total number of objects we want to scan (total_scan) is greater than the total number of objects available (max_pass). Since total_scan is biased as half max_pass if the current delta change is small: if (delta < max_pass / 4) total_scan = min(total_scan, max_pass / 2); this is only possible if we are scanning at high prio. That said, this patch shouldn't change the vmscan behaviour if the memory pressure is low, but if we are tight on memory, we will do our best by trying to reclaim all available objects, which sounds reasonable. [*] http://www.spinics.net/lists/cgroups/msg06913.html Signed-off-by: Vladimir Davydov Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Dave Chinner Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2254f36b74b8..45c1cf61cbed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -281,17 +281,34 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); - while (total_scan >= batch_size) { + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (max_pass), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= max_pass) { unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); - shrinkctl->nr_to_scan = batch_size; + shrinkctl->nr_to_scan = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; cond_resched(); } -- cgit v1.2.3 From ec97097bca147d5718a5d2c024d1ec740b10096d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:23 -0800 Subject: mm: vmscan: call NUMA-unaware shrinkers irrespective of nodemask If a shrinker is not NUMA-aware, shrink_slab() should call it exactly once with nid=0, but currently it is not true: if node 0 is not set in the nodemask or if it is not online, we will not call such shrinkers at all. As a result some slabs will be left untouched under some circumstances. Let us fix it. Signed-off-by: Vladimir Davydov Reported-by: Dave Chinner Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 45c1cf61cbed..90c4075d8d75 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -369,16 +369,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { - if (!node_online(shrinkctl->nid)) - continue; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && - (shrinkctl->nid != 0)) - break; - + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { + shrinkctl->nid = 0; freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); + nr_pages_scanned, lru_pages); + continue; + } + + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (node_online(shrinkctl->nid)) + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); } } -- cgit v1.2.3 From 354f17e1e2512018a603793cc133e2f296f6ebc6 Mon Sep 17 00:00:00 2001 From: Philipp Hachtmann Date: Thu, 23 Jan 2014 15:53:24 -0800 Subject: mm/nobootmem: free_all_bootmem again get_allocated_memblock_reserved_regions_info() should work if it is compiled in. Extended the ifdef around get_allocated_memblock_memory_regions_info() to include get_allocated_memblock_reserved_regions_info() as well. Similar changes in nobootmem.c/free_low_memory_core_early() where the two functions are called. [akpm@linux-foundation.org: cleanup] Signed-off-by: Philipp Hachtmann Cc: qiuxishi Cc: David Howells Cc: Daeseok Youn Cc: Jiang Liu Acked-by: Yinghai Lu Cc: Zhang Yanfei Cc: Santosh Shilimkar Cc: Grygorii Strashko Cc: Tang Chen Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 17 ++--------------- mm/nobootmem.c | 25 ++++++++++++++----------- 2 files changed, 16 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 64ed2439cf75..9c0aeef19440 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -266,33 +266,20 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( phys_addr_t *addr) { if (memblock.reserved.regions == memblock_reserved_init_regions) return 0; - /* - * Don't allow nobootmem allocator to free reserved memory regions - * array if - * - CONFIG_DEBUG_FS is enabled; - * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; - * - reserved memory regions array have been resized during boot. - * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" - * will show garbage instead of state of memory reservations. - */ - if (IS_ENABLED(CONFIG_DEBUG_FS) && - !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) - return 0; - *addr = __pa(memblock.reserved.regions); return PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( phys_addr_t *addr) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 17c89023184f..f73f2987a852 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,23 +116,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start, static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; - phys_addr_t start, end, size; + phys_addr_t start, end; u64 i; for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); + { + phys_addr_t size; + + /* Free memblock.reserved array if it was allocated */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); + + /* Free memblock.memory array if it was allocated */ + size = get_allocated_memblock_memory_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); + } #endif return count; -- cgit v1.2.3 From ac13c4622bda2a9ff8f57bbbfeff48b2a42d0963 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 23 Jan 2014 15:53:26 -0800 Subject: mm/memory_hotplug.c: move register_memory_resource out of the lock_memory_hotplug We don't need to do register_memory_resource() under lock_memory_hotplug() since it has its own lock and doesn't make any callbacks. Also register_memory_resource return NULL on failure so we don't have anything to cleanup at this point. The reason for this rfc is I was doing some experiments with hotplugging of memory on some of our larger systems. While it seems to work, it can be quite slow. With some preliminary digging I found that lock_memory_hotplug is clearly ripe for breakup. It could be broken up per nid or something but it also covers the online_page_callback. The online_page_callback shouldn't be very hard to break out. Also there is the issue of various structures(wmarks come to mind) that are only updated under the lock_memory_hotplug that would need to be dealt with. Cc: Tang Chen Cc: Wen Congyang Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Cc: "Rafael J. Wysocki" Cc: Hedi Cc: Mike Travis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a512a47241a4..a650db29606f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1107,17 +1107,18 @@ int __ref add_memory(int nid, u64 start, u64 size) if (ret) return ret; - lock_memory_hotplug(); - res = register_memory_resource(start, size); ret = -EEXIST; if (!res) - goto out; + return ret; { /* Stupid hack to suppress address-never-null warning */ void *p = NODE_DATA(nid); new_pgdat = !p; } + + lock_memory_hotplug(); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); -- cgit v1.2.3 From 42aa83cb6757800f4e2b499f5db3127761517a6a Mon Sep 17 00:00:00 2001 From: Han Pingtian Date: Thu, 23 Jan 2014 15:53:28 -0800 Subject: mm: show message when updating min_free_kbytes in thp min_free_kbytes may be raised during THP's initialization. Sometimes, this will change the value which was set by the user. Showing this message will clarify this confusion. Only show this message when changing a value which was set by the user according to Michal Hocko's suggestion. Show the old value of min_free_kbytes according to Dave Hansen's suggestion. This will give user the chance to restore old value of min_free_kbytes. Signed-off-by: Han Pingtian Reviewed-by: Michal Hocko Cc: David Rientjes Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 +++++++- mm/internal.h | 1 + mm/page_alloc.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 25fab7150fa0..afe738358370 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -130,8 +130,14 @@ static int set_recommended_min_free_kbytes(void) (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); - if (recommended_min > min_free_kbytes) + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu " + "to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + min_free_kbytes = recommended_min; + } setup_per_zone_wmarks(); return 0; } diff --git a/mm/internal.h b/mm/internal.h index dc95e979ae56..7e145e8cd1e6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -99,6 +99,7 @@ extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif +extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a818d568ddf3..e3758a09a009 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; -int user_min_free_kbytes; +int user_min_free_kbytes = -1; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; -- cgit v1.2.3 From da29bd36224bfa008df5d83df496c07e31a0da6d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 23 Jan 2014 15:53:29 -0800 Subject: mm/mm_init.c: make creation of the mm_kobj happen earlier than device_initcall The use of __initcall is to be eventually replaced by choosing one from the prioritized groupings laid out in init.h header: pure_initcall 0 core_initcall 1 postcore_initcall 2 arch_initcall 3 subsys_initcall 4 fs_initcall 5 device_initcall 6 late_initcall 7 In the interim, all __initcall are mapped onto device_initcall, which as can be seen above, comes quite late in the ordering. Currently the mm_kobj is created with __initcall in mm_sysfs_init(). This means that any other initcalls that want to reference the mm_kobj have to be device_initcall (or later), otherwise we will for example, trip the BUG_ON(!kobj) in sysfs's internal_create_group(). This unfairly restricts those users; for example something that clearly makes sense to be an arch_initcall will not be able to choose that. However, upon examination, it is only this way for historical reasons (i.e. simply not reprioritized yet). We see that sysfs is ready quite earlier in init/main.c via: vfs_caches_init |_ mnt_init |_ sysfs_init well ahead of the processing of the prioritized calls listed above. So we can recategorize mm_sysfs_init to be a pure_initcall, which in turn allows any mm_kobj initcall users a wider range (1 --> 7) of initcall priorities to choose from. Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mm_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 68562e92d50c..857a6434e3a5 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void) return 0; } - -__initcall(mm_sysfs_init); +pure_initcall(mm_sysfs_init); -- cgit v1.2.3 From a64fb3cd610c8e6806512dbac63f3fc45812d8fd Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 23 Jan 2014 15:53:30 -0800 Subject: mm: audit/fix non-modular users of module_init in core code Code that is obj-y (always built-in) or dependent on a bool Kconfig (built-in or absent) can never be modular. So using module_init as an alias for __initcall can be somewhat misleading. Fix these up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. The audit targets the following module_init users for change: mm/ksm.c bool KSM mm/mmap.c bool MMU mm/huge_memory.c bool TRANSPARENT_HUGEPAGE mm/mmu_notifier.c bool MMU_NOTIFIER Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of subsys_initcall (which makes sense for these files) will thus change this registration from level 6-device to level 4-subsys (i.e. slightly earlier). However no observable impact of that difference has been observed during testing. One might think that core_initcall (l2) or postcore_initcall (l3) would be more appropriate for anything in mm/ but if we look at some actual init functions themselves, we see things like: mm/huge_memory.c --> hugepage_init --> hugepage_init_sysfs mm/mmap.c --> init_user_reserve --> sysctl_user_reserve_kbytes mm/ksm.c --> ksm_init --> sysfs_create_group and hence the choice of subsys_initcall (l4) seems reasonable, and at the same time minimizes the risk of changing the priority too drastically all at once. We can adjust further in the future. Also, several instances of missing ";" at EOL are fixed. Signed-off-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/mmap.c | 6 +++--- mm/mmu_notifier.c | 3 +-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afe738358370..65c98eb5483c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -661,7 +661,7 @@ out: hugepage_exit_sysfs(hugepage_kobj); return err; } -module_init(hugepage_init) +subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { diff --git a/mm/ksm.c b/mm/ksm.c index f91ddf5c3688..aa4c7c7250c1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2345,4 +2345,4 @@ out_free: out: return err; } -module_init(ksm_init) +subsys_initcall(ksm_init); diff --git a/mm/mmap.c b/mm/mmap.c index a0e7153a79e6..126d8b976bfd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3142,7 +3142,7 @@ static int init_user_reserve(void) sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; } -module_init(init_user_reserve) +subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. @@ -3163,7 +3163,7 @@ static int init_admin_reserve(void) sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } -module_init(init_admin_reserve) +subsys_initcall(init_admin_reserve); /* * Reinititalise user and admin reserves if memory is added or removed. @@ -3233,4 +3233,4 @@ static int __meminit init_reserve_notifier(void) return 0; } -module_init(init_reserve_notifier) +subsys_initcall(init_reserve_notifier); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 93e6089cb456..41cefdf0aadd 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); } - -module_init(mmu_notifier_init); +subsys_initcall(mmu_notifier_init); -- cgit v1.2.3 From d2ab70aaae74456ed608740915dc82ef52291f69 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 23 Jan 2014 15:53:30 -0800 Subject: mm/memcg: fix last_dead_count memory wastage Shorten mem_cgroup_reclaim_iter.last_dead_count from unsigned long to int: it's assigned from an int and compared with an int, and adjacent to an unsigned int: so there's no point to it being unsigned long, which wasted 104 bytes in every mem_cgroup_per_zone. Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c8715056e1ef..aa66cc4c9e79 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -149,7 +149,7 @@ struct mem_cgroup_reclaim_iter { * matches memcg->dead_count of the hierarchy root group. */ struct mem_cgroup *last_visited; - unsigned long last_dead_count; + int last_dead_count; /* scan generation, increased every round-trip */ unsigned int generation; -- cgit v1.2.3 From d8ad30559715ce97afb7d1a93a12fd90e8fff312 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 23 Jan 2014 15:53:32 -0800 Subject: mm/memcg: iteration skip memcgs not yet fully initialized It is surprising that the mem_cgroup iterator can return memcgs which have not yet been fully initialized. By accident (or trial and error?) this appears not to present an actual problem; but it may be better to prevent such surprises, by skipping memcgs not yet online. Signed-off-by: Hugh Dickins Cc: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aa66cc4c9e79..9537e1389ee6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1119,10 +1119,8 @@ skip_node: * protected by css_get and the tree walk is rcu safe. */ if (next_css) { - struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - - if (css_tryget(&mem->css)) - return mem; + if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)) + return mem_cgroup_from_css(next_css); else { prev_css = next_css; goto skip_node; -- cgit v1.2.3 From d49ad9355420c743c736bfd1dee9eaa5b1a7722a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 23 Jan 2014 15:53:34 -0800 Subject: mm, oom: prefer thread group leaders for display purposes When two threads have the same badness score, it's preferable to kill the thread group leader so that the actual process name is printed to the kernel log rather than the thread group name which may be shared amongst several processes. This was the behavior when select_bad_process() used to do for_each_process(), but it now iterates threads instead and leads to ambiguity. Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++++++++++------- mm/oom_kill.c | 12 ++++++++---- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9537e1389ee6..c8336e8f8df0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1841,13 +1841,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, break; }; points = oom_badness(task, memcg, NULL, totalpages); - if (points > chosen_points) { - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = points; - get_task_struct(chosen); - } + if (!points || points < chosen_points) + continue; + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && + thread_group_leader(chosen)) + continue; + + if (chosen) + put_task_struct(chosen); + chosen = task; + chosen_points = points; + get_task_struct(chosen); } css_task_iter_end(&it); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 054ff47c4478..37b1b1903fb2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -327,10 +327,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, break; }; points = oom_badness(p, NULL, nodemask, totalpages); - if (points > chosen_points) { - chosen = p; - chosen_points = points; - } + if (!points || points < chosen_points) + continue; + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && thread_group_leader(chosen)) + continue; + + chosen = p; + chosen_points = points; } if (chosen) get_task_struct(chosen); -- cgit v1.2.3 From ecc736fc3c71c411a9d201d8588c9e7e049e5d8c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 23 Jan 2014 15:53:35 -0800 Subject: memcg: fix endless loop caused by mem_cgroup_iter Hugh has reported an endless loop when the hardlimit reclaim sees the same group all the time. This might happen when the reclaim races with the memcg removal. shrink_zone [rmdir root] mem_cgroup_iter(root, NULL, reclaim) // prev = NULL rcu_read_lock() mem_cgroup_iter_load last_visited = iter->last_visited // gets root || NULL css_tryget(last_visited) // failed last_visited = NULL [1] memcg = root = __mem_cgroup_iter_next(root, NULL) mem_cgroup_iter_update iter->last_visited = root; reclaim->generation = iter->generation mem_cgroup_iter(root, root, reclaim) // prev = root rcu_read_lock mem_cgroup_iter_load last_visited = iter->last_visited // gets root css_tryget(last_visited) // failed [1] The issue seemed to be introduced by commit 5f5781619718 ("memcg: relax memcg iter caching") which has replaced unconditional css_get/css_put by css_tryget/css_put for the cached iterator. This patch fixes the issue by skipping css_tryget on the root of the tree walk in mem_cgroup_iter_load and symmetrically doesn't release it in mem_cgroup_iter_update. Signed-off-by: Michal Hocko Reported-by: Hugh Dickins Tested-by: Hugh Dickins Cc: Johannes Weiner Cc: Greg Thelen Cc: [3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c8336e8f8df0..da07784dde87 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1158,7 +1158,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, if (iter->last_dead_count == *sequence) { smp_rmb(); position = iter->last_visited; - if (position && !css_tryget(&position->css)) + + /* + * We cannot take a reference to root because we might race + * with root removal and returning NULL would end up in + * an endless loop on the iterator user level when root + * would be returned all the time. + */ + if (position && position != root && + !css_tryget(&position->css)) position = NULL; } return position; @@ -1167,9 +1175,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, struct mem_cgroup *last_visited, struct mem_cgroup *new_position, + struct mem_cgroup *root, int sequence) { - if (last_visited) + /* root reference counting symmetric to mem_cgroup_iter_load */ + if (last_visited && last_visited != root) css_put(&last_visited->css); /* * We store the sequence count from the time @last_visited was @@ -1244,7 +1254,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - mem_cgroup_iter_update(iter, last_visited, memcg, seq); + mem_cgroup_iter_update(iter, last_visited, memcg, root, + seq); if (!memcg) iter->generation++; -- cgit v1.2.3 From 0eef615665ede1e0d603ea9ecca88c1da6f02234 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 23 Jan 2014 15:53:37 -0800 Subject: memcg: fix css reference leak and endless loop in mem_cgroup_iter Commit 19f39402864e ("memcg: simplify mem_cgroup_iter") has reorganized mem_cgroup_iter code in order to simplify it. A part of that change was dropping an optimization which didn't call css_tryget on the root of the walked tree. The patch however didn't change the css_put part in mem_cgroup_iter which excludes root. This wasn't an issue at the time because __mem_cgroup_iter_next bailed out for root early without taking a reference as cgroup iterators (css_next_descendant_pre) didn't visit root themselves. Nevertheless cgroup iterators have been reworked to visit root by commit bd8815a6d802 ("cgroup: make css_for_each_descendant() and friends include the origin css in the iteration") when the root bypass have been dropped in __mem_cgroup_iter_next. This means that css_put is not called for root and so css along with mem_cgroup and other cgroup internal object tied by css lifetime are never freed. Fix the issue by reintroducing root check in __mem_cgroup_iter_next and do not take css reference for it. This reference counting magic protects us also from another issue, an endless loop reported by Hugh Dickins when reclaim races with root removal and css_tryget called by iterator internally would fail. There would be no other nodes to visit so __mem_cgroup_iter_next would return NULL and mem_cgroup_iter would interpret it as "start looping from root again" and so mem_cgroup_iter would loop forever internally. Signed-off-by: Michal Hocko Reported-by: Hugh Dickins Tested-by: Hugh Dickins Cc: Johannes Weiner Cc: Greg Thelen Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da07784dde87..98f80beeac7f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1117,14 +1117,22 @@ skip_node: * skipped and we should continue the tree walk. * last_visited css is safe to use because it is * protected by css_get and the tree walk is rcu safe. + * + * We do not take a reference on the root of the tree walk + * because we might race with the root removal when it would + * be the only node in the iterated hierarchy and mem_cgroup_iter + * would end up in an endless loop because it expects that at + * least one valid node will be returned. Root cannot disappear + * because caller of the iterator should hold it already so + * skipping css reference should be safe. */ if (next_css) { - if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)) + if ((next_css->flags & CSS_ONLINE) && + (next_css == &root->css || css_tryget(next_css))) return mem_cgroup_from_css(next_css); - else { - prev_css = next_css; - goto skip_node; - } + + prev_css = next_css; + goto skip_node; } return NULL; -- cgit v1.2.3 From 6c14466cc00ff13121ae782d33d9df0fde20b124 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 Jan 2014 15:53:38 -0800 Subject: mm: improve documentation of page_order Developers occasionally try and optimise PFN scanners by using page_order but miss that in general it requires zone->lock. This has happened twice for compaction.c and rejected both times. This patch clarifies the documentation of page_order and adds a note to compaction.c why page_order is not used. [akpm@linux-foundation.org: tweaks] [lauraa@codeaurora.org: Corrected a page_zone(page)->lock reference] Signed-off-by: Mel Gorman Acked-by: Rafael Aquini Acked-by: Minchan Kim Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 ++++- mm/internal.h | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e0ab02d70f13..b48c5259ea33 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -523,7 +523,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!isolation_suitable(cc, page)) goto next_pageblock; - /* Skip if free */ + /* + * Skip if free. page_order cannot be used without zone->lock + * as nothing prevents parallel allocations or buddy merging. + */ if (PageBuddy(page)) continue; diff --git a/mm/internal.h b/mm/internal.h index 7e145e8cd1e6..612c14f5e0f5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -143,9 +143,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* - * function for dealing with page's order in buddy system. - * zone->lock is already acquired when we use these. - * So, we don't need atomic page->flags operations here. + * This function returns the order of a free page in the buddy system. In + * general, page_zone(page)->lock must be held by the caller to prevent the + * page from being allocated in parallel and returning garbage as the order. + * If a caller does not hold page_zone(page)->lock, it must guarantee that the + * page cannot be allocated or merged in parallel. */ static inline unsigned long page_order(struct page *page) { -- cgit v1.2.3 From 0d8a4a3799ab007b7a5e50aff9da9558925e0c15 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 23 Jan 2014 15:53:39 -0800 Subject: memcg: remove unused code from kmem_cache_destroy_work_func Signed-off-by: Vladimir Davydov Reviewed-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 98f80beeac7f..19d5d4274e22 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3359,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) * So if we aren't down to zero, we'll just schedule a worker and try * again */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - return; - } else + else kmem_cache_destroy(cachep); } -- cgit v1.2.3 From a5998061daab27802c418debe662be98a6e42874 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Jan 2014 15:53:40 -0800 Subject: mm/swapfile.c: do not skip lowest_bit in scan_swap_map() scan loop In the second half of scan_swap_map()'s scan loop, offset is set to si->lowest_bit and then incremented before entering the loop for the first time, causing si->swap_map[si->lowest_bit] to be skipped. Signed-off-by: Jamie Liu Cc: Shaohua Li Acked-by: Hugh Dickins Cc: Minchan Kim Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index d443dea95c27..c6c13b050a58 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -616,7 +616,7 @@ scan: } } offset = si->lowest_bit; - while (++offset < scan_base) { + while (offset < scan_base) { if (!si->swap_map[offset]) { spin_lock(&si->lock); goto checks; @@ -629,6 +629,7 @@ scan: cond_resched(); latency_ration = LATENCY_LIMIT; } + offset++; } spin_lock(&si->lock); -- cgit v1.2.3 From 871beb8c313ab270242022d314e37db5044e2bab Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 23 Jan 2014 15:53:41 -0800 Subject: mm/rmap: fix coccinelle warnings mm/rmap.c:851:9-10: WARNING: return of 0/1 in function 'invalid_mkclean_vma' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: coccinelle/misc/boolreturn.cocci Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 2dcd3353c3f6..d9d42316a99a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -848,9 +848,9 @@ out: static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) - return 0; + return false; - return 1; + return true; } int page_mkclean(struct page *page) -- cgit v1.2.3 From 34228d473efe764d4db7c0536375f0c993e6e06a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jan 2014 15:53:42 -0800 Subject: mm: ignore VM_SOFTDIRTY on VMA merging The VM_SOFTDIRTY bit affects vma merge routine: if two VMAs has all bits in vm_flags matched except dirty bit the kernel can't longer merge them and this forces the kernel to generate new VMAs instead. It finally may lead to the situation when userspace application reaches vm.max_map_count limit and get crashed in worse case | (gimp:11768): GLib-ERROR **: gmem.c:110: failed to allocate 4096 bytes | | (file-tiff-load:12038): LibGimpBase-WARNING **: file-tiff-load: gimp_wire_read(): error | xinit: connection to X server lost | | waiting for X server to shut down | /usr/lib64/gimp/2.0/plug-ins/file-tiff-load terminated: Hangup | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup https://bugzilla.kernel.org/show_bug.cgi?id=67651 https://bugzilla.gnome.org/show_bug.cgi?id=719619#c0 Initial problem came from missed VM_SOFTDIRTY in do_brk() routine but even if we would set up VM_SOFTDIRTY here, there is still a way to prevent VMAs from merging: one can call | echo 4 > /proc/$PID/clear_refs and clear all VM_SOFTDIRTY over all VMAs presented in memory map, then new do_brk() will try to extend old VMA and finds that dirty bit doesn't match thus new VMA will be generated. As discussed with Pavel, the right approach should be to ignore VM_SOFTDIRTY bit when we're trying to merge VMAs and if merge successed we mark extended VMA with dirty bit where needed. Signed-off-by: Cyrill Gorcunov Reported-by: Bastian Hougaard Reported-by: Mel Gorman Cc: Pavel Emelyanov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 126d8b976bfd..20ff0c33274c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -894,7 +894,15 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags ^ vm_flags) + /* + * VM_SOFTDIRTY should not prevent from VMA merging, if we + * match the flags but dirty bit -- the caller should mark + * merged VMA as dirty. If dirty bit won't be excluded from + * comparison, we increase pressue on the memory system forcing + * the kernel to generate new VMAs when old one could be + * extended instead. + */ + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) return 0; if (vma->vm_file != file) return 0; @@ -1083,7 +1091,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } -- cgit v1.2.3 From feda821e76f3bbbba4bd54d30b4d4005a7848aa5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Dec 2013 05:16:54 -0800 Subject: fs: remove generic_acl And instead convert tmpfs to use the new generic ACL code, with two stub methods provided for in-memory filesystems. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/shmem.c | 57 +++++++++++++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 902a14842b74..b21ca543458c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt; #include #include #include -#include +#include #include #include #include @@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); -#ifdef CONFIG_TMPFS_POSIX_ACL if (attr->ia_valid & ATTR_MODE) - error = generic_acl_chmod(inode); -#endif + error = posix_acl_chmod(inode, inode->i_mode); return error; } @@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#endif + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } + if (error && error != -EOPNOTSUPP) + goto out_iput; error = 0; dir->i_size += BOGO_DIRENT_SIZE; @@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) dget(dentry); /* Extra count - pin the dentry in core */ } return error; +out_iput: + iput(inode); + return error; } static int @@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else - error = 0; -#endif + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; d_tmpfile(dentry, inode); } return error; +out_iput: + iput(inode); + return error; } static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode, static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL - &generic_acl_access_handler, - &generic_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; @@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, .removexattr = shmem_removexattr, + .set_acl = simple_set_acl, #endif }; @@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; @@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; -- cgit v1.2.3 From 9fe55eea7e4b444bafc42fa0000cc2d1d2847275 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 24 Jan 2014 14:42:22 +0000 Subject: Fix race when checking i_size on direct i/o read So far I've had one ACK for this, and no other comments. So I think it is probably time to send this via some suitable tree. I'm guessing that the vfs tree would be the most appropriate route, but not sure that there is one at the moment (don't see anything recent at kernel.org) so in that case I think -mm is the "back up plan". Al, please let me know if you will take this? Steve. --------------------- Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes" thread on linux-fsdevel, this patch is my current version of the fix proposed as option (b) in that thread. Removing the i_size test from the direct i/o read path at vfs level means that filesystems now have to deal with requests which are beyond i_size themselves. These I've divided into three sets: a) Those with "no op" ->direct_IO (9p, cifs, ceph) These are obviously not going to be an issue b) Those with "home brew" ->direct_IO (nfs, fuse) I've been told that NFS should not have any problem with the larger i_size, however I've added an extra test to FUSE to duplicate the original behaviour just to be on the safe side. c) Those using __blockdev_direct_IO() These call through to ->get_block() which should deal with the EOF condition correctly. I've verified that with GFS2 and I believe that Zheng has verified it for ext4. I've also run the test on XFS and it passes both before and after this change. The part of the patch in filemap.c looks a lot larger than it really is - there are only two lines of real change. The rest is just indentation of the contained code. There remains a test of i_size though, which was added for btrfs. It doesn't cause the other filesystems a problem as the test is performed after ->direct_IO has been called. It is possible that there is a race that does matter to btrfs, however this patch doesn't change that, so its still an overall improvement. Signed-off-by: Steven Whitehouse Reported-by: Zheng Liu Cc: Jan Kara Cc: Dave Chinner Acked-by: Miklos Szeredi Cc: Chris Mason Cc: Josef Bacik Cc: Christoph Hellwig Cc: Alexander Viro Signed-off-by: Al Viro --- mm/filemap.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b7749a92021c..01842867c9d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; /* skip atime */ size = i_size_read(inode); - if (pos < size) { - retval = filemap_write_and_wait_range(mapping, pos, + retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); - if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); - } - if (retval > 0) { - *ppos = pos + retval; - count -= retval; - } + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } + if (retval > 0) { + *ppos = pos + retval; + count -= retval; + } - /* - * Btrfs can have a short DIO read if we encounter - * compressed extents, so if there was an error, or if - * we've already read everything we wanted to, or if - * there was a short read because we hit EOF, go ahead - * and return. Otherwise fallthrough to buffered io for - * the rest of the read. - */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); - goto out; - } + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { + file_accessed(filp); + goto out; } } -- cgit v1.2.3 From fb5bb60cd004a00c1d11db680a37942ecdedb1c5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 27 Jan 2014 17:06:52 -0800 Subject: memblock: don't silently align size in memblock_virt_alloc() In original __alloc_memory_core_early() for bootmem wrapper, we do not align size silently. We should not do that, as later free with old size will leave some range not freed. It's obvious that code is copied from memblock_base_nid(), and that code is wrong for the same reason. Also remove that in memblock_alloc_base. Signed-off-by: Yinghai Lu Acked-by: Santosh Shilimkar Cc: Dave Hansen Cc: Russell King Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 9c0aeef19440..87d21a6ff63c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -984,9 +984,6 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, if (!align) align = SMP_CACHE_BYTES; - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); - found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; @@ -1080,9 +1077,6 @@ static void * __init memblock_virt_alloc_internal( if (!align) align = SMP_CACHE_BYTES; - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); - again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, nid); -- cgit v1.2.3 From add688fbd32158440dbe62c07269a39ed969c059 Mon Sep 17 00:00:00 2001 From: malc Date: Mon, 27 Jan 2014 17:06:53 -0800 Subject: Revert "mm/vmalloc: interchage the implementation of vmalloc_to_{pfn,page}" Revert commit ece86e222db4, which was intended as a small performance improvement. Despite the claim that the patch doesn't introduce any functional changes in fact it does. The "no page" path behaves different now. Originally, vmalloc_to_page might return NULL under some conditions, with new implementation it returns pfn_to_page(0) which is not the same as NULL. Simple test shows the difference. test.c #include #include #include #include int __init myi(void) { struct page *p; void *v; v = vmalloc(PAGE_SIZE); /* trigger the "no page" path in vmalloc_to_page*/ vfree(v); p = vmalloc_to_page(v); pr_err("expected val = NULL, returned val = %p", p); return -EBUSY; } void __exit mye(void) { } module_init(myi) module_exit(mye) Before interchange: expected val = NULL, returned val = (null) After interchange: expected val = NULL, returned val = c7ebe000 Signed-off-by: Vladimir Murzin Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e4f0db2a3eae..0fdf96803c5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the physical pfn it maps to. + * Walk a vmap address to the struct page it maps. */ -unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; - unsigned long pfn = 0; + struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); /* @@ -244,23 +244,23 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) - pfn = pte_pfn(pte); + page = pte_page(pte); pte_unmap(ptep); } } } - return pfn; + return page; } -EXPORT_SYMBOL(vmalloc_to_pfn); +EXPORT_SYMBOL(vmalloc_to_page); /* - * Map a vmalloc()-space virtual address to the struct page. + * Map a vmalloc()-space virtual address to the physical page frame number. */ -struct page *vmalloc_to_page(const void *vmalloc_addr) +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { - return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } -EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -- cgit v1.2.3 From e82cb95d626a6bb0e4fe7db1f311dc22039c2ed3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 27 Jan 2014 17:06:55 -0800 Subject: mm: bring back /sys/kernel/mm Commit da29bd36224b ("mm/mm_init.c: make creation of the mm_kobj happen earlier than device_initcall") changed to pure_initcall(mm_sysfs_init). That's too early: mm_sysfs_init() depends on core_initcall(ksysfs_init) to have made the kernel_kobj directory "kernel" in which to create "mm". Make it postcore_initcall(mm_sysfs_init). We could use core_initcall(), and depend upon Makefile link order kernel/ mm/ fs/ ipc/ security/ ... as core_initcall(debugfs_init) and core_initcall(securityfs_init) do; but better not. Signed-off-by: Hugh Dickins Acked-by: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 857a6434e3a5..4074caf9936b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -202,4 +202,4 @@ static int __init mm_sysfs_init(void) return 0; } -pure_initcall(mm_sysfs_init); +postcore_initcall(mm_sysfs_init); -- cgit v1.2.3 From a3978a519461b095b776f44a86079f5448c96963 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 27 Jan 2014 17:07:17 -0800 Subject: mm/migrate.c: fix setting of cpupid on page migration twice against normal page Commit 7851a45cd3f6 ("mm: numa: Copy cpupid on page migration") copies over the cpupid at page migration time. It is unnecessary to set it again in alloc_misplaced_dst_page(). Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 734704f6f29b..482a33d89134 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1548,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); - if (newpage) - page_cpupid_xchg_last(newpage, page_cpupid_last(page)); return newpage; } -- cgit v1.2.3 From a804552b9a15c931cfc2a92a2e0aed1add8b580a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Jan 2014 14:05:39 -0800 Subject: mm/page-writeback.c: fix dirty_balance_reserve subtraction from dirtyable memory Tejun reported stuttering and latency spikes on a system where random tasks would enter direct reclaim and get stuck on dirty pages. Around 50% of memory was occupied by tmpfs backed by an SSD, and another disk (rotating) was reading and writing at max speed to shrink a partition. : The problem was pretty ridiculous. It's a 8gig machine w/ one ssd and 10k : rpm harddrive and I could reliably reproduce constant stuttering every : several seconds for as long as buffered IO was going on on the hard drive : either with tmpfs occupying somewhere above 4gig or a test program which : allocates about the same amount of anon memory. Although swap usage was : zero, turning off swap also made the problem go away too. : : The trigger conditions seem quite plausible - high anon memory usage w/ : heavy buffered IO and swap configured - and it's highly likely that this : is happening in the wild too. (this can happen with copying large files : to usb sticks too, right?) This patch (of 2): The dirty_balance_reserve is an approximation of the fraction of free pages that the page allocator does not make available for page cache allocations. As a result, it has to be taken into account when calculating the amount of "dirtyable memory", the baseline to which dirty_background_ratio and dirty_ratio are applied. However, currently the reserve is subtracted from the sum of free and reclaimable pages, which is non-sensical and leads to erroneous results when the system is dominated by unreclaimable pages and the dirty_balance_reserve is bigger than free+reclaimable. In that case, at least the already allocated cache should be considered dirtyable. Fix the calculation by subtracting the reserve from the amount of free pages, then adding the reclaimable pages on top. [akpm@linux-foundation.org: fix CONFIG_HIGHMEM build] Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Tested-by: Tejun Heo Reviewed-by: Rik van Riel Cc: Mel Gorman Cc: Wu Fengguang Reviewed-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 55 +++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 63807583d8e8..61119b8a11e6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -191,6 +191,25 @@ static unsigned long writeout_period_time = 0; * global dirtyable memory first. */ +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + unsigned long nr_pages; + + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + + nr_pages += zone_reclaimable_pages(zone); + + return nr_pages; +} + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -198,11 +217,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) unsigned long x = 0; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z) - z->dirty_balance_reserve; + x += zone_dirtyable_memory(z); } /* * Unreclaimable memory (kernel memory or anonymous memory @@ -238,9 +255,11 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x = global_page_state(NR_FREE_PAGES); x -= min(x, dirty_balance_reserve); + x += global_reclaimable_pages(); + if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -288,32 +307,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) trace_global_dirty_state(background, dirty); } -/** - * zone_dirtyable_memory - number of dirtyable pages in a zone - * @zone: the zone - * - * Returns the zone's number of pages potentially available for dirty - * page cache. This is the base value for the per-zone dirty limits. - */ -static unsigned long zone_dirtyable_memory(struct zone *zone) -{ - /* - * The effective global number of dirtyable pages may exclude - * highmem as a big-picture measure to keep the ratio between - * dirty memory and lowmem reasonable. - * - * But this function is purely about the individual zone and a - * highmem zone can hold its share of dirty pages, so we don't - * care about vm_highmem_is_dirtyable here. - */ - unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone); - - /* don't allow this to underflow */ - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); - return nr_pages; -} - /** * zone_dirty_limit - maximum number of dirty pages allowed in a zone * @zone: the zone -- cgit v1.2.3 From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Jan 2014 14:05:41 -0800 Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory The VM is currently heavily tuned to avoid swapping. Whether that is good or bad is a separate discussion, but as long as the VM won't swap to make room for dirty cache, we can not consider anonymous pages when calculating the amount of dirtyable memory, the baseline to which dirty_background_ratio and dirty_ratio are applied. A simple workload that occupies a significant size (40+%, depending on memory layout, storage speeds etc.) of memory with anon/tmpfs pages and uses the remainder for a streaming writer demonstrates this problem. In that case, the actual cache pages are a small fraction of what is considered dirtyable overall, which results in an relatively large portion of the cache pages to be dirtied. As kswapd starts rotating these, random tasks enter direct reclaim and stall on IO. Only consider free pages and file pages dirtyable. Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Tested-by: Tejun Heo Reviewed-by: Rik van Riel Cc: Mel Gorman Cc: Wu Fengguang Reviewed-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 - mm/page-writeback.c | 6 ++++-- mm/vmscan.c | 23 +---------------------- 3 files changed, 5 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 612c14f5e0f5..29e1e761f9eb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -83,7 +83,6 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern unsigned long zone_reclaimable_pages(struct zone *zone); extern bool zone_reclaimable(struct zone *zone); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 61119b8a11e6..2d30e2cfe804 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -205,7 +205,8 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) nr_pages = zone_page_state(zone, NR_FREE_PAGES); nr_pages -= min(nr_pages, zone->dirty_balance_reserve); - nr_pages += zone_reclaimable_pages(zone); + nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); + nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); return nr_pages; } @@ -258,7 +259,8 @@ static unsigned long global_dirtyable_memory(void) x = global_page_state(NR_FREE_PAGES); x -= min(x, dirty_balance_reserve); - x += global_reclaimable_pages(); + x += global_page_state(NR_INACTIVE_FILE); + x += global_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/vmscan.c b/mm/vmscan.c index 90c4075d8d75..a9c74b409681 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc) } #endif -unsigned long zone_reclaimable_pages(struct zone *zone) +static unsigned long zone_reclaimable_pages(struct zone *zone) { int nr; @@ -3315,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) wake_up_interruptible(&pgdat->kswapd_wait); } -/* - * The reclaimable count would be mostly accurate. - * The less reclaimable pages may be - * - mlocked pages, which will be moved to unevictable list when encountered - * - mapped pages, which may require several travels to be reclaimed - * - dirty pages, which is not "instantly" reclaimable - */ -unsigned long global_reclaimable_pages(void) -{ - int nr; - - nr = global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of -- cgit v1.2.3 From c297663c0b3930491a3cb2aba4b6e5a7159c3503 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jan 2014 14:05:42 -0800 Subject: mm: numa: initialise numa balancing after jump label initialisation The command line parsing takes place before jump labels are initialised which generates a warning if numa_balancing= is specified and CONFIG_JUMP_LABEL is set. On older kernels before commit c4b2c0c5f647 ("static_key: WARN on usage before jump_label_init was called") the kernel would have crashed. This patch enables automatic numa balancing later in the initialisation process if numa_balancing= is specified. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: stable Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cb46cddf61..79cea01f9f78 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2654,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p) } #ifdef CONFIG_NUMA_BALANCING -static bool __initdata numabalancing_override; +static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { @@ -2663,9 +2663,15 @@ static void __init check_numabalancing_enable(void) if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; + /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ + if (numabalancing_override) + set_numabalancing_state(numabalancing_override == 1); + if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or the kernel.numa_balancing sysctl"); + printk(KERN_INFO "%s automatic NUMA balancing. " + "Configure with numa_balancing= or the " + "kernel.numa_balancing sysctl", + numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } @@ -2675,13 +2681,12 @@ static int __init setup_numabalancing(char *str) int ret = 0; if (!str) goto out; - numabalancing_override = true; if (!strcmp(str, "enable")) { - set_numabalancing_state(true); + numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); + numabalancing_override = -1; ret = 1; } out: -- cgit v1.2.3 From 4a404bea941ac3c62e11b88c9d16197334eee2f1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 29 Jan 2014 14:05:43 -0800 Subject: mm/mempolicy.c: convert to pr_foo() A few printk(KERN_*'s have snuck in there. Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 79cea01f9f78..873de7e542bc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2668,7 +2668,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "%s automatic NUMA balancing. " + pr_info("%s automatic NUMA balancing. " "Configure with numa_balancing= or the " "kernel.numa_balancing sysctl", numabalancing_default ? "Enabling" : "Disabling"); @@ -2691,7 +2691,7 @@ static int __init setup_numabalancing(char *str) } out: if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + pr_warn("Unable to parse numa_balancing=\n"); return ret; } -- cgit v1.2.3 From ba3253c78d7443d2c80c544b1e7aec9f39938395 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 29 Jan 2014 14:05:48 -0800 Subject: slab: fix wrong retval on kmem_cache_create_memcg error path On kmem_cache_create_memcg() error path we set 'err', but leave 's' (the new cache ptr) undefined. The latter can be NULL if we could not allocate the cache, or pointing to a freed area if we failed somewhere later while trying to initialize it. Initially we checked 'err' immediately before exiting the function and returned NULL if it was set ignoring the value of 's': out_unlock: ... if (err) { /* report error */ return NULL; } return s; Recently this check was, in fact, broken by commit f717eb3abb5e ("slab: do not panic if we fail to create memcg cache"), which turned it to: out_unlock: ... if (err && !memcg) { /* report error */ return NULL; } return s; As a result, if we are failing creating a cache for a memcg, we will skip the check and return 's' that can contain crap. Obviously, commit f717eb3abb5e intended not to return crap on error allocating a cache for a memcg, but only to remove the error reporting in this case, so the check should look like this: out_unlock: ... if (err) { if (!memcg) return NULL; /* report error */ return NULL; } return s; [rientjes@google.com: despaghettification] [vdavydov@parallels.com: patch monkeying] Signed-off-by: David Rientjes Signed-off-by: Vladimir Davydov Signed-off-by: Dave Jones Reported-by: Dave Jones Acked-by: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 8e40321da091..1ec3c619ba04 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -233,14 +233,17 @@ out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); - /* - * There is no point in flooding logs with warnings or especially - * crashing the system if we fail to create a cache for a memcg. In - * this case we will be accounting the memcg allocation to the root - * cgroup until we succeed to create its own cache, but it isn't that - * critical. - */ - if (err && !memcg) { + if (err) { + /* + * There is no point in flooding logs with warnings or + * especially crashing the system if we fail to create a cache + * for a memcg. In this case we will be accounting the memcg + * allocation to the root cgroup until we succeed to create its + * own cache, but it isn't that critical. + */ + if (!memcg) + return NULL; + if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); -- cgit v1.2.3 From a0132ac0f275434db32111b8cf7372d991899da3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 29 Jan 2014 14:05:50 -0800 Subject: mm/slub.c: do not VM_BUG_ON_PAGE() for temporary on-stack pages Commit 309381feaee5 ("mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE") added a bunch of VM_BUG_ON_PAGE() calls. But, most of the ones in the slub code are for _temporary_ 'struct page's which are declared on the stack and likely have lots of gunk in them. Dumping their contents out will just confuse folks looking at bad_page() output. Plus, if we try to page_to_pfn() on them or soemthing, we'll probably oops anyway. Turn them back in to VM_BUG_ON()s. Signed-off-by: Dave Hansen Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 34bb8c65a2d8..545a170ebf9f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1559,7 +1559,7 @@ static inline void *acquire_slab(struct kmem_cache *s, new.freelist = freelist; } - VM_BUG_ON_PAGE(new.frozen, &new); + VM_BUG_ON(new.frozen); new.frozen = 1; if (!__cmpxchg_double_slab(s, page, @@ -1812,7 +1812,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, set_freepointer(s, freelist, prior); new.counters = counters; new.inuse--; - VM_BUG_ON_PAGE(!new.frozen, &new); + VM_BUG_ON(!new.frozen); } while (!__cmpxchg_double_slab(s, page, prior, counters, @@ -1840,7 +1840,7 @@ redo: old.freelist = page->freelist; old.counters = page->counters; - VM_BUG_ON_PAGE(!old.frozen, &old); + VM_BUG_ON(!old.frozen); /* Determine target state of the slab */ new.counters = old.counters; @@ -1952,7 +1952,7 @@ static void unfreeze_partials(struct kmem_cache *s, old.freelist = page->freelist; old.counters = page->counters; - VM_BUG_ON_PAGE(!old.frozen, &old); + VM_BUG_ON(!old.frozen); new.counters = old.counters; new.freelist = old.freelist; @@ -2225,7 +2225,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) counters = page->counters; new.counters = counters; - VM_BUG_ON_PAGE(!new.frozen, &new); + VM_BUG_ON(!new.frozen); new.inuse = page->objects; new.frozen = freelist != NULL; @@ -2319,7 +2319,7 @@ load_freelist: * page is pointing to the page from which the objects are obtained. * That page must be frozen for per cpu allocations to work. */ - VM_BUG_ON_PAGE(!c->page->frozen, c->page); + VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_irq_restore(flags); -- cgit v1.2.3 From 58d5640ebdb273cc817b0d0cda7bcf2efbbc2ff7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 29 Jan 2014 14:05:51 -0800 Subject: mm/readahead.c: fix do_readahead() for no readpage(s) Commit 63d0f0a3c7e1 ("mm/readahead.c:do_readhead(): don't check for ->readpage") unintentionally made do_readahead return 0 for all valid files regardless of whether readahead was supported, rather than the expected -EINVAL. This gets forwarded on to userspace, and results in sys_readahead appearing to succeed in cases that don't make sense (e.g. when called on pipes or sockets). This issue is detected by the LTP readahead01 testcase. As the exact return value of force_page_cache_readahead is currently never used, we can simplify it to return only 0 or -EINVAL (when readpage or readpages is missing). With that in place we can simply forward on the return value of force_page_cache_readahead in do_readahead. This patch performs said change, restoring the expected semantics. Signed-off-by: Mark Rutland Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 7cdbb44aa90b..0de2360d65f3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -211,8 +211,6 @@ out: int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read) { - int ret = 0; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; @@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); - if (err < 0) { - ret = err; - break; - } - ret += err; + if (err < 0) + return err; + offset += this_chunk; nr_to_read -= this_chunk; } - return ret; + return 0; } /* @@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, nr); - return 0; + return force_page_cache_readahead(mapping, filp, index, nr); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) -- cgit v1.2.3 From f544e14f3e765b5241d7f234fee677506b8ce07f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 29 Jan 2014 14:05:52 -0800 Subject: memblock: add limit checking to memblock_virt_alloc In original bootmem wrapper for memblock, we have limit checking. Add it to memblock_virt_alloc, to address arm and x86 booting crash. Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Reported-by: Kevin Hilman Tested-by: Kevin Hilman Reported-by: Olof Johansson Tested-by: Olof Johansson Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Cc: Dave Hansen Cc: Santosh Shilimkar Cc: "Strashko, Grygorii" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 87d21a6ff63c..39a31e7f0045 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1077,6 +1077,9 @@ static void * __init memblock_virt_alloc_internal( if (!align) align = SMP_CACHE_BYTES; + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, nid); -- cgit v1.2.3 From bcf1647d0899666f0fb90d176abf63bae22abb7c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:50 -0800 Subject: zsmalloc: move it under mm This patch moves zsmalloc under mm directory. Before that, description will explain why we have needed custom allocator. Zsmalloc is a new slab-based memory allocator for storing compressed pages. It is designed for low fragmentation and high allocation success rate on large object, but <= PAGE_SIZE allocations. zsmalloc differs from the kernel slab allocator in two primary ways to achieve these design goals. zsmalloc never requires high order page allocations to back slabs, or "size classes" in zsmalloc terms. Instead it allows multiple single-order pages to be stitched together into a "zspage" which backs the slab. This allows for higher allocation success rate under memory pressure. Also, zsmalloc allows objects to span page boundaries within the zspage. This allows for lower fragmentation than could be had with the kernel slab allocator for objects between PAGE_SIZE/2 and PAGE_SIZE. With the kernel slab allocator, if a page compresses to 60% of it original size, the memory savings gained through compression is lost in fragmentation because another object of the same size can't be stored in the leftover space. This ability to span pages results in zsmalloc allocations not being directly addressable by the user. The user is given an non-dereferencable handle in response to an allocation request. That handle must be mapped, using zs_map_object(), which returns a pointer to the mapped region that can be used. The mapping is necessary since the object data may reside in two different noncontigious pages. The zsmalloc fulfills the allocation needs for zram perfectly [sjenning@linux.vnet.ibm.com: borrow Seth's quote] Signed-off-by: Minchan Kim Acked-by: Nitin Gupta Reviewed-by: Konrad Rzeszutek Wilk Cc: Bob Liu Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Jens Axboe Cc: Luigi Semenzato Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 25 ++ mm/Makefile | 1 + mm/zsmalloc.c | 1105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1131 insertions(+) create mode 100644 mm/zsmalloc.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 723bbe04a0b0..2d9f1504d75e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY it can be cleared by hands. See Documentation/vm/soft-dirty.txt for more details. + +config ZSMALLOC + bool "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. + +config PGTABLE_MAPPING + bool "Use page table mapping to access object in zsmalloc" + depends on ZSMALLOC + help + By default, zsmalloc uses a copy-based object mapping method to + access allocations that span two pages. However, if a particular + architecture (ex, ARM) performs VM mapping faster than copying, + then you should select this. This causes zsmalloc to use page table + mapping rather than copying for object mapping. + + You can check speed with zsmalloc benchmark[1]. + [1] https://github.com/spartacus06/zsmalloc diff --git a/mm/Makefile b/mm/Makefile index 305d10acd081..310c90a09264 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c new file mode 100644 index 000000000000..5d42adfcb67b --- /dev/null +++ b/mm/zsmalloc.c @@ -0,0 +1,1105 @@ +/* + * zsmalloc memory allocator + * + * Copyright (C) 2011 Nitin Gupta + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the license that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + */ + +/* + * This allocator is designed for use with zram. Thus, the allocator is + * supposed to work well under low memory conditions. In particular, it + * never attempts higher order page allocation which is very likely to + * fail under memory pressure. On the other hand, if we just use single + * (0-order) pages, it would suffer from very high fragmentation -- + * any object of size PAGE_SIZE/2 or larger would occupy an entire page. + * This was one of the major issues with its predecessor (xvmalloc). + * + * To overcome these issues, zsmalloc allocates a bunch of 0-order pages + * and links them together using various 'struct page' fields. These linked + * pages act as a single higher-order page i.e. an object can span 0-order + * page boundaries. The code refers to these linked pages as a single entity + * called zspage. + * + * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE + * since this satisfies the requirements of all its current users (in the + * worst case, page is incompressible and is thus stored "as-is" i.e. in + * uncompressed form). For allocation requests larger than this size, failure + * is returned (see zs_malloc). + * + * Additionally, zs_malloc() does not return a dereferenceable pointer. + * Instead, it returns an opaque handle (unsigned long) which encodes actual + * location of the allocated object. The reason for this indirection is that + * zsmalloc does not keep zspages permanently mapped since that would cause + * issues on 32-bit systems where the VA region for kernel space mappings + * is very small. So, before using the allocating memory, the object has to + * be mapped using zs_map_object() to get a usable pointer and subsequently + * unmapped using zs_unmap_object(). + * + * Following is how we use various fields and flags of underlying + * struct page(s) to form a zspage. + * + * Usage of struct page fields: + * page->first_page: points to the first component (0-order) page + * page->index (union with page->freelist): offset of the first object + * starting in this page. For the first page, this is + * always 0, so we use this field (aka freelist) to point + * to the first free object in zspage. + * page->lru: links together all component pages (except the first page) + * of a zspage + * + * For _first_ page only: + * + * page->private (union with page->first_page): refers to the + * component page after the first page + * page->freelist: points to the first free object in zspage. + * Free objects are linked together using in-place + * metadata. + * page->objects: maximum number of objects we can store in this + * zspage (class->zspage_order * PAGE_SIZE / class->size) + * page->lru: links together first pages of various zspages. + * Basically forming list of zspages in a fullness group. + * page->mapping: class index and fullness group of the zspage + * + * Usage of struct page flags: + * PG_private: identifies the first component page + * PG_private2: identifies the last component page + * + */ + +#ifdef CONFIG_ZSMALLOC_DEBUG +#define DEBUG +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This must be power of 2 and greater than of equal to sizeof(link_free). + * These two conditions ensure that any 'struct link_free' itself doesn't + * span more than 1 page which avoids complex case of mapping 2 pages simply + * to restore link_free pointer values. + */ +#define ZS_ALIGN 8 + +/* + * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) + * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. + */ +#define ZS_MAX_ZSPAGE_ORDER 2 +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) + +/* + * Object location (, ) is encoded as + * as single (unsigned long) handle value. + * + * Note that object index is relative to system + * page it is stored in, so for each sub-page belonging + * to a zspage, obj_idx starts with 0. + * + * This is made more complicated by various memory models and PAE. + */ + +#ifndef MAX_PHYSMEM_BITS +#ifdef CONFIG_HIGHMEM64G +#define MAX_PHYSMEM_BITS 36 +#else /* !CONFIG_HIGHMEM64G */ +/* + * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just + * be PAGE_SHIFT + */ +#define MAX_PHYSMEM_BITS BITS_PER_LONG +#endif +#endif +#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) +#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ +#define ZS_MIN_ALLOC_SIZE \ + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +#define ZS_MAX_ALLOC_SIZE PAGE_SIZE + +/* + * On systems with 4K page size, this gives 254 size classes! There is a + * trader-off here: + * - Large number of size classes is potentially wasteful as free page are + * spread across these classes + * - Small number of size classes causes large internal fragmentation + * - Probably its better to use specific size classes (empirically + * determined). NOTE: all those class sizes must be set as multiple of + * ZS_ALIGN to make sure link_free itself never has to span 2 pages. + * + * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN + * (reason above) + */ +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) +#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ + ZS_SIZE_CLASS_DELTA + 1) + +/* + * We do not maintain any list for completely empty or full pages + */ +enum fullness_group { + ZS_ALMOST_FULL, + ZS_ALMOST_EMPTY, + _ZS_NR_FULLNESS_GROUPS, + + ZS_EMPTY, + ZS_FULL +}; + +/* + * We assign a page to ZS_ALMOST_EMPTY fullness group when: + * n <= N / f, where + * n = number of allocated objects + * N = total number of objects zspage can store + * f = 1/fullness_threshold_frac + * + * Similarly, we assign zspage to: + * ZS_ALMOST_FULL when n > N / f + * ZS_EMPTY when n == 0 + * ZS_FULL when n == N + * + * (see: fix_fullness_group()) + */ +static const int fullness_threshold_frac = 4; + +struct size_class { + /* + * Size of objects stored in this class. Must be multiple + * of ZS_ALIGN. + */ + int size; + unsigned int index; + + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; + + spinlock_t lock; + + /* stats */ + u64 pages_allocated; + + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; +}; + +/* + * Placed within free objects to form a singly linked list. + * For every zspage, first_page->freelist gives head of this list. + * + * This must be power of 2 and less than or equal to ZS_ALIGN + */ +struct link_free { + /* Handle of next free chunk (encodes ) */ + void *next; +}; + +struct zs_pool { + struct size_class size_class[ZS_SIZE_CLASSES]; + + gfp_t flags; /* allocation flags used when growing pool */ +}; + +/* + * A zspage's class index and fullness group + * are encoded in its (first)page->mapping + */ +#define CLASS_IDX_BITS 28 +#define FULLNESS_BITS 4 +#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) +#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) + +struct mapping_area { +#ifdef CONFIG_PGTABLE_MAPPING + struct vm_struct *vm; /* vm area for mapping object that span pages */ +#else + char *vm_buf; /* copy buffer for objects that span pages */ +#endif + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +}; + + +/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +static DEFINE_PER_CPU(struct mapping_area, zs_map_area); + +static int is_first_page(struct page *page) +{ + return PagePrivate(page); +} + +static int is_last_page(struct page *page) +{ + return PagePrivate2(page); +} + +static void get_zspage_mapping(struct page *page, unsigned int *class_idx, + enum fullness_group *fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = (unsigned long)page->mapping; + *fullness = m & FULLNESS_MASK; + *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; +} + +static void set_zspage_mapping(struct page *page, unsigned int class_idx, + enum fullness_group fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | + (fullness & FULLNESS_MASK); + page->mapping = (struct address_space *)m; +} + +/* + * zsmalloc divides the pool into various size classes where each + * class maintains a list of zspages where each zspage is divided + * into equal sized chunks. Each allocation falls into one of these + * classes depending on its size. This function returns index of the + * size class which has chunk size big enough to hold the give size. + */ +static int get_size_class_index(int size) +{ + int idx = 0; + + if (likely(size > ZS_MIN_ALLOC_SIZE)) + idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, + ZS_SIZE_CLASS_DELTA); + + return idx; +} + +/* + * For each size class, zspages are divided into different groups + * depending on how "full" they are. This was done so that we could + * easily find empty or nearly empty zspages when we try to shrink + * the pool (not yet implemented). This function returns fullness + * status of the given page. + */ +static enum fullness_group get_fullness_group(struct page *page) +{ + int inuse, max_objects; + enum fullness_group fg; + BUG_ON(!is_first_page(page)); + + inuse = page->inuse; + max_objects = page->objects; + + if (inuse == 0) + fg = ZS_EMPTY; + else if (inuse == max_objects) + fg = ZS_FULL; + else if (inuse <= max_objects / fullness_threshold_frac) + fg = ZS_ALMOST_EMPTY; + else + fg = ZS_ALMOST_FULL; + + return fg; +} + +/* + * Each size class maintains various freelists and zspages are assigned + * to one of these freelists based on the number of live objects they + * have. This functions inserts the given zspage into the freelist + * identified by . + */ +static void insert_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + if (*head) + list_add_tail(&page->lru, &(*head)->lru); + + *head = page; +} + +/* + * This function removes the given zspage from the freelist identified + * by . + */ +static void remove_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + BUG_ON(!*head); + if (list_empty(&(*head)->lru)) + *head = NULL; + else if (*head == page) + *head = (struct page *)list_entry((*head)->lru.next, + struct page, lru); + + list_del_init(&page->lru); +} + +/* + * Each size class maintains zspages in different fullness groups depending + * on the number of live objects they contain. When allocating or freeing + * objects, the fullness status of the page can change, say, from ALMOST_FULL + * to ALMOST_EMPTY when freeing an object. This function checks if such + * a status change has occurred for the given page and accordingly moves the + * page from the freelist of the old fullness group to that of the new + * fullness group. + */ +static enum fullness_group fix_fullness_group(struct zs_pool *pool, + struct page *page) +{ + int class_idx; + struct size_class *class; + enum fullness_group currfg, newfg; + + BUG_ON(!is_first_page(page)); + + get_zspage_mapping(page, &class_idx, &currfg); + newfg = get_fullness_group(page); + if (newfg == currfg) + goto out; + + class = &pool->size_class[class_idx]; + remove_zspage(page, class, currfg); + insert_zspage(page, class, newfg); + set_zspage_mapping(page, class_idx, newfg); + +out: + return newfg; +} + +/* + * We have to decide on how many pages to link together + * to form a zspage for each size class. This is important + * to reduce wastage due to unusable space left at end of + * each zspage which is given as: + * wastage = Zp - Zp % size_class + * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... + * + * For example, for size class of 3/8 * PAGE_SIZE, we should + * link together 3 PAGE_SIZE sized pages to form a zspage + * since then we can perfectly fit in 8 such objects. + */ +static int get_pages_per_zspage(int class_size) +{ + int i, max_usedpc = 0; + /* zspage order which gives maximum used size per KB */ + int max_usedpc_order = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int zspage_size; + int waste, usedpc; + + zspage_size = i * PAGE_SIZE; + waste = zspage_size % class_size; + usedpc = (zspage_size - waste) * 100 / zspage_size; + + if (usedpc > max_usedpc) { + max_usedpc = usedpc; + max_usedpc_order = i; + } + } + + return max_usedpc_order; +} + +/* + * A single 'zspage' is composed of many system pages which are + * linked together using fields in struct page. This function finds + * the first/head page, given any component page of a zspage. + */ +static struct page *get_first_page(struct page *page) +{ + if (is_first_page(page)) + return page; + else + return page->first_page; +} + +static struct page *get_next_page(struct page *page) +{ + struct page *next; + + if (is_last_page(page)) + next = NULL; + else if (is_first_page(page)) + next = (struct page *)page_private(page); + else + next = list_entry(page->lru.next, struct page, lru); + + return next; +} + +/* + * Encode as a single handle value. + * On hardware platforms with physical memory starting at 0x0 the pfn + * could be 0 so we ensure that the handle will never be 0 by adjusting the + * encoded obj_idx value before encoding. + */ +static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +{ + unsigned long handle; + + if (!page) { + BUG_ON(obj_idx); + return NULL; + } + + handle = page_to_pfn(page) << OBJ_INDEX_BITS; + handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); + + return (void *)handle; +} + +/* + * Decode pair from the given object handle. We adjust the + * decoded obj_idx back to its original value since it was adjusted in + * obj_location_to_handle(). + */ +static void obj_handle_to_location(unsigned long handle, struct page **page, + unsigned long *obj_idx) +{ + *page = pfn_to_page(handle >> OBJ_INDEX_BITS); + *obj_idx = (handle & OBJ_INDEX_MASK) - 1; +} + +static unsigned long obj_idx_to_offset(struct page *page, + unsigned long obj_idx, int class_size) +{ + unsigned long off = 0; + + if (!is_first_page(page)) + off = page->index; + + return off + obj_idx * class_size; +} + +static void reset_page(struct page *page) +{ + clear_bit(PG_private, &page->flags); + clear_bit(PG_private_2, &page->flags); + set_page_private(page, 0); + page->mapping = NULL; + page->freelist = NULL; + page_mapcount_reset(page); +} + +static void free_zspage(struct page *first_page) +{ + struct page *nextp, *tmp, *head_extra; + + BUG_ON(!is_first_page(first_page)); + BUG_ON(first_page->inuse); + + head_extra = (struct page *)page_private(first_page); + + reset_page(first_page); + __free_page(first_page); + + /* zspage with only 1 system page */ + if (!head_extra) + return; + + list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { + list_del(&nextp->lru); + reset_page(nextp); + __free_page(nextp); + } + reset_page(head_extra); + __free_page(head_extra); +} + +/* Initialize a newly allocated zspage */ +static void init_zspage(struct page *first_page, struct size_class *class) +{ + unsigned long off = 0; + struct page *page = first_page; + + BUG_ON(!is_first_page(first_page)); + while (page) { + struct page *next_page; + struct link_free *link; + unsigned int i, objs_on_page; + + /* + * page->index stores offset of first object starting + * in the page. For the first page, this is always 0, + * so we use first_page->index (aka ->freelist) to store + * head of corresponding zspage's freelist. + */ + if (page != first_page) + page->index = off; + + link = (struct link_free *)kmap_atomic(page) + + off / sizeof(*link); + objs_on_page = (PAGE_SIZE - off) / class->size; + + for (i = 1; i <= objs_on_page; i++) { + off += class->size; + if (off < PAGE_SIZE) { + link->next = obj_location_to_handle(page, i); + link += class->size / sizeof(*link); + } + } + + /* + * We now come to the last (full or partial) object on this + * page, which must point to the first object on the next + * page (if present) + */ + next_page = get_next_page(page); + link->next = obj_location_to_handle(next_page, 0); + kunmap_atomic(link); + page = next_page; + off = (off + class->size) % PAGE_SIZE; + } +} + +/* + * Allocate a zspage for the given size class + */ +static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +{ + int i, error; + struct page *first_page = NULL, *uninitialized_var(prev_page); + + /* + * Allocate individual pages and link them together as: + * 1. first page->private = first sub-page + * 2. all sub-pages are linked together using page->lru + * 3. each sub-page is linked to the first page using page->first_page + * + * For each size class, First/Head pages are linked together using + * page->lru. Also, we set PG_private to identify the first page + * (i.e. no other sub-page has this flag set) and PG_private_2 to + * identify the last page. + */ + error = -ENOMEM; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; + + page = alloc_page(flags); + if (!page) + goto cleanup; + + INIT_LIST_HEAD(&page->lru); + if (i == 0) { /* first page */ + SetPagePrivate(page); + set_page_private(page, 0); + first_page = page; + first_page->inuse = 0; + } + if (i == 1) + set_page_private(first_page, (unsigned long)page); + if (i >= 1) + page->first_page = first_page; + if (i >= 2) + list_add(&page->lru, &prev_page->lru); + if (i == class->pages_per_zspage - 1) /* last page */ + SetPagePrivate2(page); + prev_page = page; + } + + init_zspage(first_page, class); + + first_page->freelist = obj_location_to_handle(first_page, 0); + /* Maximum number of objects we can store in this zspage */ + first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + + error = 0; /* Success */ + +cleanup: + if (unlikely(error) && first_page) { + free_zspage(first_page); + first_page = NULL; + } + + return first_page; +} + +static struct page *find_get_zspage(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) + break; + } + + return page; +} + +#ifdef CONFIG_PGTABLE_MAPPING +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm) + return 0; + area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); + if (!area->vm) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + if (area->vm) + free_vm_area(area->vm); + area->vm = NULL; +} + +static inline void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + area->vm_addr = area->vm->addr; + return area->vm_addr + off; +} + +static inline void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + unsigned long addr = (unsigned long)area->vm_addr; + + unmap_kernel_range(addr, PAGE_SIZE * 2); +} + +#else /* CONFIG_PGTABLE_MAPPING */ + +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm_buf) + return 0; + area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + if (!area->vm_buf) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + if (area->vm_buf) + free_page((unsigned long)area->vm_buf); + area->vm_buf = NULL; +} + +static void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* disable page faults to match kmap_atomic() return conditions */ + pagefault_disable(); + + /* no read fastpath */ + if (area->vm_mm == ZS_MM_WO) + goto out; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy object to per-cpu buffer */ + addr = kmap_atomic(pages[0]); + memcpy(buf, addr + off, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(buf + sizes[0], addr, sizes[1]); + kunmap_atomic(addr); +out: + return area->vm_buf; +} + +static void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* no write fastpath */ + if (area->vm_mm == ZS_MM_RO) + goto out; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy per-cpu buffer to object */ + addr = kmap_atomic(pages[0]); + memcpy(addr + off, buf, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(addr, buf + sizes[0], sizes[1]); + kunmap_atomic(addr); + +out: + /* enable page faults to match kunmap_atomic() return conditions */ + pagefault_enable(); +} + +#endif /* CONFIG_PGTABLE_MAPPING */ + +static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, + void *pcpu) +{ + int ret, cpu = (long)pcpu; + struct mapping_area *area; + + switch (action) { + case CPU_UP_PREPARE: + area = &per_cpu(zs_map_area, cpu); + ret = __zs_cpu_up(area); + if (ret) + return notifier_from_errno(ret); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block zs_cpu_nb = { + .notifier_call = zs_cpu_notifier +}; + +static void zs_exit(void) +{ + int cpu; + + for_each_online_cpu(cpu) + zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); + unregister_cpu_notifier(&zs_cpu_nb); +} + +static int zs_init(void) +{ + int cpu, ret; + + register_cpu_notifier(&zs_cpu_nb); + for_each_online_cpu(cpu) { + ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + if (notifier_to_errno(ret)) + goto fail; + } + return 0; +fail: + zs_exit(); + return notifier_to_errno(ret); +} + +/** + * zs_create_pool - Creates an allocation pool to work from. + * @flags: allocation flags used to allocate pool metadata + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(gfp_t flags) +{ + int i, ovhd_size; + struct zs_pool *pool; + + ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); + pool = kzalloc(ovhd_size, GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + int size; + struct size_class *class; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + + class = &pool->size_class[i]; + class->size = size; + class->index = i; + spin_lock_init(&class->lock); + class->pages_per_zspage = get_pages_per_zspage(size); + + } + + pool->flags = flags; + + return pool; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + int fg; + struct size_class *class = &pool->size_class[i]; + + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + } + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +/** + * zs_malloc - Allocate block of given size from pool. + * @pool: pool to allocate from + * @size: size of block to allocate + * + * On success, handle to the allocated object is returned, + * otherwise 0. + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. + */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size) +{ + unsigned long obj; + struct link_free *link; + int class_idx; + struct size_class *class; + + struct page *first_page, *m_page; + unsigned long m_objidx, m_offset; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return 0; + + class_idx = get_size_class_index(size); + class = &pool->size_class[class_idx]; + BUG_ON(class_idx != class->index); + + spin_lock(&class->lock); + first_page = find_get_zspage(class); + + if (!first_page) { + spin_unlock(&class->lock); + first_page = alloc_zspage(class, pool->flags); + if (unlikely(!first_page)) + return 0; + + set_zspage_mapping(first_page, class->index, ZS_EMPTY); + spin_lock(&class->lock); + class->pages_allocated += class->pages_per_zspage; + } + + obj = (unsigned long)first_page->freelist; + obj_handle_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + link = (struct link_free *)kmap_atomic(m_page) + + m_offset / sizeof(*link); + first_page->freelist = link->next; + memset(link, POISON_INUSE, sizeof(*link)); + kunmap_atomic(link); + + first_page->inuse++; + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(pool, first_page); + spin_unlock(&class->lock); + + return obj; +} +EXPORT_SYMBOL_GPL(zs_malloc); + +void zs_free(struct zs_pool *pool, unsigned long obj) +{ + struct link_free *link; + struct page *first_page, *f_page; + unsigned long f_objidx, f_offset; + + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!obj)) + return; + + obj_handle_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = &pool->size_class[class_idx]; + f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + + spin_lock(&class->lock); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) + + f_offset); + link->next = first_page->freelist; + kunmap_atomic(link); + first_page->freelist = (void *)obj; + + first_page->inuse--; + fullness = fix_fullness_group(pool, first_page); + + if (fullness == ZS_EMPTY) + class->pages_allocated -= class->pages_per_zspage; + + spin_unlock(&class->lock); + + if (fullness == ZS_EMPTY) + free_zspage(first_page); +} +EXPORT_SYMBOL_GPL(zs_free); + +/** + * zs_map_object - get address of allocated object from handle. + * @pool: pool from which the object was allocated + * @handle: handle returned from zs_malloc + * + * Before using an object allocated from zs_malloc, it must be mapped using + * this function. When done with the object, it must be unmapped using + * zs_unmap_object. + * + * Only one object can be mapped per cpu at a time. There is no protection + * against nested mappings. + * + * This function returns with preemption and page faults disabled. + */ +void *zs_map_object(struct zs_pool *pool, unsigned long handle, + enum zs_mapmode mm) +{ + struct page *page; + unsigned long obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; + + BUG_ON(!handle); + + /* + * Because we use per-cpu mapping areas shared among the + * pools/users, we can't allow mapping in interrupt context + * because it can corrupt another users mappings. + */ + BUG_ON(in_interrupt()); + + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = &pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = &get_cpu_var(zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + area->vm_addr = kmap_atomic(page); + return area->vm_addr + off; + } + + /* this object spans two pages */ + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + return __zs_map_object(area, pages, off, class->size); +} +EXPORT_SYMBOL_GPL(zs_map_object); + +void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +{ + struct page *page; + unsigned long obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + + BUG_ON(!handle); + + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = &pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = &__get_cpu_var(zs_map_area); + if (off + class->size <= PAGE_SIZE) + kunmap_atomic(area->vm_addr); + else { + struct page *pages[2]; + + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + __zs_unmap_object(area, pages, off, class->size); + } + put_cpu_var(zs_map_area); +} +EXPORT_SYMBOL_GPL(zs_unmap_object); + +u64 zs_get_total_size_bytes(struct zs_pool *pool) +{ + int i; + u64 npages = 0; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) + npages += pool->size_class[i].pages_allocated; + + return npages << PAGE_SHIFT; +} +EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); + +module_init(zs_init); +module_exit(zs_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta "); -- cgit v1.2.3 From 31fc00bb788ffde7d8d861d8b2bba798ab445992 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:55 -0800 Subject: zsmalloc: add copyright Add my copyright to the zsmalloc source code which I maintain. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d42adfcb67b..c03ca5e9fe15 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. -- cgit v1.2.3 From 8790c71a18e5d2d93532ae250bcf5eddbba729cd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Jan 2014 15:46:08 -0800 Subject: mm/mempolicy.c: fix mempolicy printing in numa_maps As a result of commit 5606e3877ad8 ("mm: numa: Migrate on reference policy"), /proc//numa_maps prints the mempolicy for any as "prefer:N" for the local node, N, of the process reading the file. This should only be printed when the mempolicy of is MPOL_PREFERRED for node N. If the process is actually only using the default mempolicy for local node allocation, make sure "default" is printed as expected. Signed-off-by: David Rientjes Reported-by: Robert Lippert Cc: Peter Zijlstra Acked-by: Mel Gorman Cc: Ingo Molnar Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 873de7e542bc..ae3c8f3595d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2930,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy) { + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } -- cgit v1.2.3 From a03208652dad18232e9ec3432df69f9c19c856ec Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 30 Jan 2014 15:46:09 -0800 Subject: mm/slub.c: fix page->_count corruption (again) Commit abca7c496584 ("mm: fix slab->page _count corruption when using slub") notes that we can not _set_ a page->counters directly, except when using a real double-cmpxchg. Doing so can lose updates to ->_count. That is an absolute rule: You may not *set* page->counters except via a cmpxchg. Commit abca7c496584 fixed this for the folks who have the slub cmpxchg_double code turned off at compile time, but it left the bad case alone. It can still be reached, and the same bug triggered in two cases: 1. Turning on slub debugging at runtime, which is available on the distro kernels that I looked at. 2. On 64-bit CPUs with no CMPXCHG16B (some early AMD x86-64 cpus, evidently) There are at least 3 ways we could fix this: 1. Take all of the exising calls to cmpxchg_double_slab() and __cmpxchg_double_slab() and convert them to take an old, new and target 'struct page'. 2. Do (1), but with the newly-introduced 'slub_data'. 3. Do some magic inside the two cmpxchg...slab() functions to pull the counters out of new_counters and only set those fields in page->{inuse,frozen,objects}. I've done (2) as well, but it's a bunch more code. This patch is an attempt at (3). This was the most straightforward and foolproof way that I could think to do this. This would also technically allow us to get rid of the ugly #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) in 'struct page', but leaving it alone has the added benefit that 'counters' stays 'unsigned' instead of 'unsigned long', so all the copies that the slub code does stay a bit smaller. Signed-off-by: Dave Hansen Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Cc: Pravin B Shelar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 545a170ebf9f..2b1a6970e46f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. + */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); return 1; } @@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); return 1; -- cgit v1.2.3 From 778c14affaf94a9e4953179d3e13a544ccce7707 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Jan 2014 15:46:11 -0800 Subject: mm, oom: base root bonus on current usage A 3% of system memory bonus is sometimes too excessive in comparison to other processes. With commit a63d83f427fb ("oom: badness heuristic rewrite"), the OOM killer tries to avoid killing privileged tasks by subtracting 3% of overall memory (system or cgroup) from their per-task consumption. But as a result, all root tasks that consume less than 3% of overall memory are considered equal, and so it only takes 33+ privileged tasks pushing the system out of memory for the OOM killer to do something stupid and kill dhclient or other root-owned processes. For example, on a 32G machine it can't tell the difference between the 1M agetty and the 10G fork bomb member. The changelog describes this 3% boost as the equivalent to the global overcommit limit being 3% higher for privileged tasks, but this is not the same as discounting 3% of overall memory from _every privileged task individually_ during OOM selection. Replace the 3% of system memory bonus with a 3% of current memory usage bonus. By giving root tasks a bonus that is proportional to their actual size, they remain comparable even when relatively small. In the example above, the OOM killer will discount the 1M agetty's 256 badness points down to 179, and the 10G fork bomb's 262144 points down to 183500 points and make the right choice, instead of discounting both to 0 and killing agetty because it's first in the task list. Signed-off-by: David Rientjes Reported-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 37b1b1903fb2..3291e82d4352 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -178,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - adj -= 30; + points -= (points * 3) / 100; /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; -- cgit v1.2.3 From 7c094fd698de2f333fa39b6da213f880d40b9bfe Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 30 Jan 2014 15:46:14 -0800 Subject: memcg: fix mutex not unlocked on memcg_create_kmem_cache fail path Commit 842e2873697e ("memcg: get rid of kmem_cache_dup()") introduced a mutex for memcg_create_kmem_cache() to protect the tmp_name buffer that holds the memcg name. It failed to unlock the mutex if this buffer could not be allocated. This patch fixes the issue by appropriately unlocking the mutex if the allocation fails. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: Glauber Costa Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 19d5d4274e22..53385cd4e6f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3400,7 +3400,7 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { - struct kmem_cache *new; + struct kmem_cache *new = NULL; static char *tmp_name = NULL; static DEFINE_MUTEX(mutex); /* protects tmp_name */ @@ -3416,7 +3416,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!tmp_name) { tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); if (!tmp_name) - return NULL; + goto out; } rcu_read_lock(); @@ -3426,12 +3426,11 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) new->allocflags |= __GFP_KMEMCG; else new = s; - +out: mutex_unlock(&mutex); return new; } -- cgit v1.2.3 From 67b6c900dc6dce65478d6fe37b60cd1e65bb80c2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 24 Jan 2014 07:20:23 -0800 Subject: mm: slub: work around unneeded lockdep warning The slub code does some setup during early boot in early_kmem_cache_node_alloc() with some local data. There is no possible way that another CPU can see this data, so the slub code doesn't unnecessarily lock it. However, some new lockdep asserts check to make sure that add_partial() _always_ has the list_lock held. Just add the locking, even though it is technically unnecessary. Cc: Peter Zijlstra Cc: Russell King Acked-by: David Rientjes Signed-off-by: Dave Hansen Signed-off-by: Pekka Enberg --- mm/slub.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a99e9e67c60e..432bddf484bb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2890,7 +2890,13 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); + /* + * the lock is for lockdep's sake, not for any actual + * race protection + */ + spin_lock(&n->list_lock); add_partial(n, page, DEACTIVATE_TO_HEAD); + spin_unlock(&n->list_lock); } static void free_kmem_cache_nodes(struct kmem_cache *s) -- cgit v1.2.3 From cb8ee1a3d429f8898972c869dd4792afb04e961a Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 28 Jan 2014 02:57:08 +0900 Subject: mm: Fix warning on make htmldocs caused by slab.c This patch fixed following errors while make htmldocs Warning(/mm/slab.c:1956): No description found for parameter 'page' Warning(/mm/slab.c:1956): Excess function parameter 'slabp' description in 'slab_destroy' Incorrect function parameter "slabp" was set instead of "page" Acked-by: Christoph Lameter Signed-off-by: Masanari Iida Signed-off-by: Pekka Enberg --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index eb043bf05f4c..b264214c77ea 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed + * @page: page pointer being destroyed * * Destroy all the objs in a slab, and release the mem back to the system. * Before calling the slab must have been unlinked from the cache. The -- cgit v1.2.3