aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c448
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/hugetlb.c110
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/init-mm.c6
-rw-r--r--mm/kmemleak.c100
-rw-r--r--mm/ksm.c71
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c462
-rw-r--r--mm/memory-failure.c120
-rw-r--r--mm/memory.c58
-rw-r--r--mm/mempolicy.c82
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mlock.c19
-rw-r--r--mm/mmap.c68
-rw-r--r--mm/nommu.c12
-rw-r--r--mm/oom_kill.c687
-rw-r--r--mm/page-writeback.c282
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/rmap.c203
-rw-r--r--mm/shmem.c139
-rw-r--r--mm/slab.c6
-rw-r--r--mm/swapfile.c100
-rw-r--r--mm/truncate.c38
-rw-r--r--mm/util.c11
-rw-r--r--mm/vmalloc.c11
-rw-r--r--mm/vmscan.c548
-rw-r--r--mm/vmstat.c8
29 files changed, 2220 insertions, 1433 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f9fd3dd3916..eaa4a5bbe06 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <trace/events/writeback.h>
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -49,8 +50,6 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -65,31 +64,25 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb;
+ struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
struct inode *inode;
- /*
- * inode lock is enough here, the bdi->wb_list is protected by
- * RCU on the reader side
- */
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(wb, &bdi->wb_list, list) {
- nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
- nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
- }
+ list_for_each_entry(inode, &wb->b_dirty, i_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_list)
+ nr_more_io++;
spin_unlock(&inode_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
- "state: %8lx\n"
- "wb_list: %8u\n",
+ "state: %8lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state,
- !list_empty(&bdi->wb_list));
+ K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
#undef K
return 0;
@@ -247,7 +237,6 @@ static int __init default_bdi_init(void)
sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
BUG_ON(IS_ERR(sync_supers_tsk));
- init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
bdi_arm_supers_timer();
@@ -259,77 +248,6 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
- memset(wb, 0, sizeof(*wb));
-
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
-}
-
-static void bdi_task_init(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
-{
- struct task_struct *tsk = current;
-
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&wb->list, &bdi->wb_list);
- spin_unlock(&bdi->wb_lock);
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-}
-
-static int bdi_start_fn(void *ptr)
-{
- struct bdi_writeback *wb = ptr;
- struct backing_dev_info *bdi = wb->bdi;
- int ret;
-
- /*
- * Add us to the active bdi_list
- */
- spin_lock_bh(&bdi_lock);
- list_add_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_task_init(bdi, wb);
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-
- ret = bdi_writeback_task(wb);
-
- /*
- * Remove us from the list
- */
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&wb->list);
- spin_unlock(&bdi->wb_lock);
-
- /*
- * Flush any work that raced with us exiting. No new work
- * will be added, since this bdi isn't discoverable anymore.
- */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
-
- wb->task = NULL;
- return ret;
-}
-
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
@@ -348,10 +266,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
}
/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
* to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
+ * bdi writeback thread individually.
*/
static int bdi_sync_supers(void *unused)
{
@@ -387,144 +305,198 @@ static void sync_supers_timer_fn(unsigned long unused)
bdi_arm_supers_timer();
}
-static int bdi_forker_task(void *ptr)
+static void wakeup_timer_fn(unsigned long data)
+{
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else {
+ /*
+ * When bdi tasks are inactive for long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
+ */
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
+
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
+
+static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
- bdi_task_init(me->bdi, me);
+ current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
for (;;) {
- struct backing_dev_info *bdi, *tmp;
- struct bdi_writeback *wb;
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
/*
* Temporary measure, we want to make sure we don't see
* dirty data on the default backing_dev_info
*/
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
wb_do_writeback(me, 0);
+ }
spin_lock_bh(&bdi_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Check if any existing bdi's have dirty data without
- * a thread registered. If so, set that up.
- */
- list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
- if (bdi->wb.task)
- continue;
- if (list_empty(&bdi->work_list) &&
- !bdi_has_dirty_io(bdi))
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
+
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
continue;
- bdi_add_default_flusher_task(bdi);
- }
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
- set_current_state(TASK_INTERRUPTIBLE);
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
- if (list_empty(&bdi_pending_list)) {
- unsigned long wait;
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone will try to
+ * unregister this bdi - it'll wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
+
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
- spin_unlock_bh(&bdi_lock);
- wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- if (wait)
- schedule_timeout(wait);
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
+ dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread.
+ */
+ bdi_flush_io(bdi);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ }
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There are no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for longer time, save energy and
+ * be friendly for battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
else
- schedule();
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
+ /* Back to the main loop */
continue;
}
- __set_current_state(TASK_RUNNING);
-
- /*
- * This is our real job - check for pending entries in
- * bdi_pending_list, and create the tasks that got added
- */
- bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
- bdi_list);
- list_del_init(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- wb = &bdi->wb;
- wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
- dev_name(bdi->dev));
/*
- * If task creation fails, then readd the bdi to
- * the pending list and force writeout of the bdi
- * from this forker thread. That will free some memory
- * and we can try again.
+ * Clear pending bit and wakeup anybody waiting to tear us down.
*/
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
-
- /*
- * Add this 'bdi' to the back, so we get
- * a chance to flush other bdi's to free
- * memory.
- */
- spin_lock_bh(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_flush_io(bdi);
- }
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
}
return 0;
}
-static void bdi_add_to_pending(struct rcu_head *head)
-{
- struct backing_dev_info *bdi;
-
- bdi = container_of(head, struct backing_dev_info, rcu_head);
- INIT_LIST_HEAD(&bdi->bdi_list);
-
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
-
- /*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
- */
- wake_up_process(default_backing_dev_info.wb.task);
-}
-
-/*
- * Add the default flusher task that gets created for any bdi
- * that has dirty data pending writeout
- */
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
-{
- if (!bdi_cap_writeback_dirty(bdi))
- return;
-
- if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
- printk(KERN_ERR "bdi %p/%s is not registered!\n",
- bdi, bdi->name);
- return;
- }
-
- /*
- * Check with the helper whether to proceed adding a task. Will only
- * abort if we two or more simultanous calls to
- * bdi_add_default_flusher_task() occured, further additions will block
- * waiting for previous additions to finish.
- */
- if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_del_rcu(&bdi->bdi_list);
-
- /*
- * We must wait for the current RCU period to end before
- * moving to the pending list. So schedule that operation
- * from an RCU callback.
- */
- call_rcu(&bdi->rcu_head, bdi_add_to_pending);
- }
-}
-
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
*/
@@ -541,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
- int ret = 0;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
- goto exit;
+ return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto exit;
- }
-
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
bdi->dev = dev;
@@ -569,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
if (bdi_cap_flush_forker(bdi)) {
struct bdi_writeback *wb = &bdi->wb;
- wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
dev_name(dev));
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
- ret = -ENOMEM;
-
- bdi_remove_from_list(bdi);
- goto exit;
- }
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
}
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
-exit:
- return ret;
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
EXPORT_SYMBOL(bdi_register);
@@ -598,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct bdi_writeback *wb;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
- * If setup is pending, wait for that to complete first
+ * Make sure nobody finds us on the bdi_list anymore
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ bdi_remove_from_list(bdi);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * If setup is pending, wait for that to complete first
*/
- bdi_remove_from_list(bdi);
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
/*
- * Finally, kill the kernel threads. We don't need to be RCU
+ * Finally, kill the kernel thread. We don't need to be RCU
* safe anymore, since the bdi is gone from visibility. Force
* unfreeze of the thread before calling kthread_stop(), otherwise
* it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list) {
- thaw_process(wb->task);
- kthread_stop(wb->task);
+ if (bdi->wb.task) {
+ thaw_process(bdi->wb.task);
+ kthread_stop(bdi->wb.task);
}
}
@@ -644,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
@@ -655,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_unregister);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -666,7 +643,6 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9..3d4df44e422 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
do {
struct page *page;
- pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009db..cc5be788a39 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
+EXPORT_SYMBOL_GPL(PageHuge);
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pte_none(pte))
continue;
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ page_remove_rmap(page);
list_del(&page->lru);
put_page(page);
}
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
+ avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
+ if (!trylock_page(old_page)) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
+ } else
+ unlock_page(old_page);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
return -PTR_ERR(new_page);
}
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
copy_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
@@ -2349,11 +2397,19 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ mmu_notifier_invalidate_range_start(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
+ mmu_notifier_invalidate_range_end(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
}
page_cache_release(new_page);
page_cache_release(old_page);
@@ -2452,10 +2508,29 @@ retry:
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
+ page_dup_rmap(page);
} else {
lock_page(page);
- page->mapping = HUGETLB_POISON;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ hugepage_add_new_anon_rmap(page, vma, address);
}
+ } else {
+ page_dup_rmap(page);
+ }
+
+ /*
+ * Since memory error handler replaces pte into hwpoison swap entry
+ * at the time of error handling, a process which reserved but not have
+ * the mapping to the error hugepage does not have hwpoison swap entry.
+ * So we need to block accesses from such a process by checking
+ * PG_hwpoison bit here.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON;
+ goto backout_unlocked;
}
/*
@@ -2507,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *page = NULL;
struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON;
+ }
+
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
@@ -2548,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ if (!pagecache_page) {
+ page = pte_page(entry);
+ lock_page(page);
+ }
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2661,8 @@ out_page_table_lock:
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
+ } else {
+ unlock_page(page);
}
out_mutex:
@@ -2785,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_put_quota(inode->i_mapping, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
+
+/*
+ * This function is called from memory failure code.
+ * Assume the caller holds page lock of the head page.
+ */
+void __isolate_hwpoisoned_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ spin_lock(&hugetlb_lock);
+ list_del(&hpage->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ spin_unlock(&hugetlb_lock);
+}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1..0948f1072d6 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
#include "internal.h"
static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
{
unsigned long pfn = val;
struct page *p;
+ struct page *hpage;
int err;
if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
return -ENXIO;
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
/*
* This implies unable to support free buddy pages.
*/
- if (!get_page_unless_zero(p))
+ if (!get_page_unless_zero(hpage))
return 0;
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
/*
* This implies unable to support non-LRU pages.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
return 0;
/*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
* We temporarily take page lock for try_get_mem_cgroup_from_page().
* __memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(p);
- err = hwpoison_filter(p);
- unlock_page(p);
+ lock_page(hpage);
+ err = hwpoison_filter(hpage);
+ unlock_page(hpage);
if (err)
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da966..1d29cdfe8eb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac89..bd9bc214091 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
object = prio_tree_entry(node, struct kmemleak_object,
tree_node);
if (!alias && object->pointer != ptr) {
- kmemleak_warn("Found object by alias");
+ pr_warning("Found object by alias at 0x%08lx\n", ptr);
+ dump_stack();
+ dump_object_info(object);
object = NULL;
}
} else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
}
/*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
* reported as a leak. This is used in general to mark a false positive.
*/
static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
rcu_read_unlock();
}
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
*/
void __ref kmemleak_free(const void *ptr)
{
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
}
EXPORT_SYMBOL_GPL(kmemleak_free);
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
*/
void __ref kmemleak_not_leak(const void *ptr)
{
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_not_leak);
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
*/
void __ref kmemleak_ignore(const void *ptr)
{
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_ignore);
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(kmemleak_scan_area);
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
*/
void __ref kmemleak_no_scan(const void *ptr)
{
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") != 0)
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
return -EINVAL;
return 0;
}
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
int i;
unsigned long flags;
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_disable();
+ return;
+ }
+#endif
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7..e2ae0045832 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
+#include <linux/hash.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash;
+#define MM_SLOTS_HASH_SHIFT 10
+#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
+static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
kmem_cache_free(mm_slot_cache, mm_slot);
}
-static int __init mm_slots_hash_init(void)
-{
- mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!mm_slots_hash)
- return -ENOMEM;
- return 0;
-}
-
-static void __init mm_slots_hash_free(void)
-{
- kfree(mm_slots_hash);
-}
-
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
struct hlist_head *bucket;
struct hlist_node *node;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
hlist_for_each_entry(mm_slot, node, bucket, link) {
if (mm == mm_slot->mm)
return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
{
struct hlist_head *bucket;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
hlist_add_head(&mm_slot->link, bucket);
}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
struct anon_vma *anon_vma)
{
rmap_item->anon_vma = anon_vma;
- atomic_inc(&anon_vma->external_refcount);
+ get_anon_vma(anon_vma);
}
-static void drop_anon_vma(struct rmap_item *rmap_item)
+static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
{
struct anon_vma *anon_vma = rmap_item->anon_vma;
- if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ drop_anon_vma(anon_vma);
}
/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (!mapcount)
goto out;
}
@@ -1619,7 +1600,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1671,7 +1652,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
if (err)
goto out;
- err = mm_slots_hash_init();
- if (err)
- goto out_free1;
-
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
printk(KERN_ERR "ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
if (err) {
printk(KERN_ERR "ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
#endif
return 0;
-out_free2:
- mm_slots_hash_free();
-out_free1:
+out_free:
ksm_slab_free();
out:
return err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 3024eb30fc2..43840b305ec 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -504,7 +504,7 @@ int __init memblock_is_reserved(u64 addr)
int memblock_is_region_reserved(u64 base, u64 size)
{
- return memblock_overlaps_region(&memblock.reserved, base, size);
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af..3eed583895a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include "internal.h"
#include <asm/uaccess.h>
+#include <trace/events/vmscan.h>
+
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +214,6 @@ struct mem_cgroup {
*/
spinlock_t reclaim_param_lock;
- int prev_priority; /* for recording reclaim priority */
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -268,6 +269,7 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct mem_cgroup *curr = NULL;
+ struct task_struct *p;
- task_lock(task);
- rcu_read_lock();
- curr = try_get_mem_cgroup_from_mm(task->mm);
- rcu_read_unlock();
- task_unlock(task);
+ p = find_lock_task_mm(task);
+ if (!p)
+ return 0;
+ curr = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
if (!curr)
return 0;
/*
@@ -858,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
- int prev_priority;
-
- spin_lock(&mem->reclaim_param_lock);
- prev_priority = mem->prev_priority;
- spin_unlock(&mem->reclaim_param_lock);
-
- return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- if (priority < mem->prev_priority)
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
@@ -944,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -954,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -999,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
LIST_HEAD(pc_list);
struct list_head *src;
struct page_cgroup *pc, *tmp;
- int nid = z->zone_pgdat->node_id;
+ int nid = zone_to_nid(z);
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;
@@ -1038,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
}
*scanned = scan;
+
+ trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+ 0, 0, 0, mode);
+
return nr_taken;
}
@@ -1072,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ bool ret = false;
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1158,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
}
/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ u64 limit;
+ u64 memsw;
+
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+ total_swap_pages;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ /*
+ * If memsw is finite and limits the amount of swap space available
+ * to this memcg, return that limit.
+ */
+ return min(limit, memsw);
+}
+
+/*
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
@@ -1262,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/* we use swappiness of local cgroup */
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone,
- zone->zone_pgdat->node_id);
+ noswap, get_swappiness(victim), zone);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
@@ -1370,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (atomic_read(&mem->oom_lock))
+ if (mem && atomic_read(&mem->oom_lock))
memcg_wakeup_oom(mem);
}
@@ -1582,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
int csize = CHARGE_SIZE;
/*
@@ -1609,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
- mem = try_get_mem_cgroup_from_mm(mm);
- *memcg = mem;
- } else {
- css_get(&mem->css);
- }
- if (unlikely(!mem))
- return 0;
-
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
- goto done;
-
- while (1) {
- int ret = 0;
- unsigned long flags = 0;
-
+ if (!*memcg && !mm)
+ goto bypass;
+again:
+ if (*memcg) { /* css should be a valid one */
+ mem = *memcg;
+ VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
if (consume_stock(mem))
goto done;
+ css_get(&mem->css);
+ } else {
+ struct task_struct *p;
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
-
- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
- }
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;
-
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
-
+ rcu_read_lock();
+ p = rcu_dereference(mm->owner);
+ VM_BUG_ON(!p);
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
+ * because we don't have task_lock(), "p" can exit while
+ * we're here. In that case, "mem" can point to root
+ * cgroup but never be NULL. (and task_struct itself is freed
+ * by RCU, cgroup itself is RCU safe.) Then, we have small
+ * risk here to get wrong cgroup. But such kind of mis-account
+ * by race always happens because we don't have cgroup_mutex().
+ * It's overkill and we allow that small race, here.
*/
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
-
- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
+ mem = mem_cgroup_from_task(p);
+ VM_BUG_ON(!mem);
+ if (mem_cgroup_is_root(mem)) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (consume_stock(mem)) {
/*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
+ * It seems dagerous to access memcg without css_get().
+ * But considering how consume_stok works, it's not
+ * necessary. If consume_stock success, some charges
+ * from this memcg are cached on this cpu. So, we
+ * don't need to call css_get()/css_tryget() before
+ * calling consume_stock().
*/
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
+ rcu_read_unlock();
+ goto done;
+ }
+ /* after here, we may be blocked. we need to get refcnt */
+ if (!css_tryget(&mem->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+ }
+
+ do {
+ bool oom_check;
+
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current)) {
+ css_put(&mem->css);
+ goto bypass;
+ }
+
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- if (!nr_retries--) {
- if (!oom)
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ css_put(&mem->css);
+ mem = NULL;
+ goto again;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ css_put(&mem->css);
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+ if (!oom) {
+ css_put(&mem->css);
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
}
- /* When we reach here, current task is dying .*/
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
css_put(&mem->css);
goto bypass;
}
- }
+ } while (ret != CHARGE_OK);
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
+ css_put(&mem->css);
done:
+ *memcg = mem;
return 0;
nomem:
- css_put(&mem->css);
+ *memcg = NULL;
return -ENOMEM;
bypass:
*memcg = NULL;
@@ -1747,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
if (do_swap_account)
res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_put(&mem->css, (int)count);
}
- /* we don't need css_put for root */
}
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2061,9 @@ out:
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
@@ -1992,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
return 0;
prefetchw(pc);
- mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
if (ret || !mem)
return ret;
@@ -2020,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
static void
@@ -2030,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -2051,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
@@ -2063,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
unlock_page_cgroup(pc);
}
- if (unlikely(!mm && !mem))
+ if (unlikely(!mm))
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
/* shmem */
if (PageSwapCache(page)) {
+ struct mem_cgroup *mem = NULL;
+
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+ MEM_CGROUP_CHARGE_TYPE_SHMEM);
return ret;
}
@@ -2114,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
goto charge_cur_mm;
*ptr = mem;
ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
- /* drop extra refcnt from tryget */
css_put(&mem->css);
return ret;
charge_cur_mm:
@@ -2245,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
- struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
@@ -2285,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
- if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -2299,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* special functions.
*/
- mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
-
+ /*
+ * even after unlock, we have mem->res.usage here and this memcg
+ * will never be freed.
+ */
memcg_check_events(mem, page);
- /* at swapout, this memcg will be accessed to record to swap */
- if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- css_put(&mem->css);
+ if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+ mem_cgroup_swap_statistics(mem, true);
+ mem_cgroup_get(mem);
+ }
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype);
return mem;
@@ -2392,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
memcg = __mem_cgroup_uncharge_common(page, ctype);
- /* record memcg information */
- if (do_swap_account && swapout && memcg) {
+ /*
+ * record memcg information, if swapout && memcg != NULL,
+ * mem_cgroup_get() was called in uncharge().
+ */
+ if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
- mem_cgroup_get(memcg);
- }
- if (swapout && memcg)
- css_put(&memcg->css);
}
#endif
@@ -2476,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
*/
if (!mem_cgroup_is_root(to))
res_counter_uncharge(&to->res, PAGE_SIZE);
- css_put(&to->css);
}
return 0;
}
@@ -2611,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
- if (unused != oldpage)
- pc = lookup_page_cgroup(unused);
__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
- pc = lookup_page_cgroup(used);
/*
* If a page is a file cache, radix-tree replacement is very atomic
* and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask, int nid,
- int zid)
+ gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (order > 0)
return 0;
- mctz = soft_limit_tree_node_zone(nid, zid);
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -3759,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
return 0;
}
-/*
- */
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
@@ -4180,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
goto one_by_one;
}
mc.precharge += count;
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_get(&mem->css, (int)count);
return ret;
}
one_by_one:
@@ -4400,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
- memcg_oom_recover(mc.to);
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
- memcg_oom_recover(mc.from);
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
- WARN_ON_ONCE(mc.moved_swap > INT_MAX);
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
- VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
- __css_put(&mc.to->css, mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
@@ -4469,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);
ret = mem_cgroup_precharge_mc(mm);
if (ret)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52caca..9c26eeca134 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
/*
* Huge pages. Needs work.
* Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- return FAILED;
+ struct page *hpage = compound_head(p);
+ /*
+ * We can safely recover from error on free or reserved (i.e.
+ * not in-use) hugepage by dequeuing it from freelist.
+ * To check whether a hugepage is in-use or not, we can't use
+ * page->lru because it can be used in other hugepage operations,
+ * such as __unmap_hugepage_range() and gather_surplus_pages().
+ * So instead we use page_mapping() and PageAnon().
+ * We assume that this function is called with page lock held,
+ * so there is no race between isolation and mapping/unmapping.
+ */
+ if (!(page_mapping(hpage) || PageAnon(hpage))) {
+ __isolate_hwpoisoned_huge_page(hpage);
+ return RECOVERED;
+ }
+ return DELAYED;
}
/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int i;
int kill = 1;
+ struct page *hpage = compound_head(p);
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(p))
+ if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageCompound(p) || PageKsm(p))
+ if (PageKsm(p))
return SWAP_FAIL;
if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(p);
- if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
- if (page_mkclean(p)) {
- SetPageDirty(p);
+ mapping = page_mapping(hpage);
+ if (!PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(p, &tokill);
+ collect_procs(hpage, &tokill);
/*
* try_to_unmap can fail temporarily due to races.
* Try a few times (RED-PEN better strategy?)
*/
for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(p, ttu);
+ ret = try_to_unmap(hpage, ttu);
if (ret == SWAP_SUCCESS)
break;
pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
ret != SWAP_SUCCESS, pfn);
return ret;
}
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ ClearPageHWPoison(hpage + i);
+}
+
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
+ struct page *hpage;
int res;
+ unsigned int nr_pages;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
- atomic_long_add(1, &mce_bad_pages);
+ nr_pages = 1 << compound_order(hpage);
+ atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(compound_head(p))) {
+ !get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
- if (!PageLRU(p)) {
+ if (!PageLRU(p) && !PageHuge(p)) {
/*
* shake_page could have turned it free.
*/
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(p);
+ lock_page_nosync(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- unlock_page(p);
- put_page(p);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
return 0;
}
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
wait_on_page_writeback(p);
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
}
out:
- unlock_page(p);
+ unlock_page(hpage);
return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned int nr_pages;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
+ nr_pages = 1 << compound_order(page);
+
if (!get_page_unless_zero(page)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
- if (TestClearPageHWPoison(p)) {
+ if (TestClearPageHWPoison(page)) {
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_dec(&mce_bad_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
}
+ if (PageHuge(p))
+ clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d363..6b2ab105185 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
* The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
if (addr > end - 1)
return;
- start = addr;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr, end = addr + size;
+ unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
- mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- mmu_notifier_invalidate_range_end(mm, start, end);
+
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
+ int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = 1;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ do_page_add_anon_rmap(page, vma, address, exclusive);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -2760,6 +2760,40 @@ out_release:
}
/*
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
+ * doesn't hit another vma.
+ */
+static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
+{
+ address &= PAGE_MASK;
+ if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
+ struct vm_area_struct *prev = vma->vm_prev;
+
+ /*
+ * Is there a mapping abutting this one below?
+ *
+ * That's only ok if it's the same stack mapping
+ * that has gotten split..
+ */
+ if (prev && prev->vm_end == address)
+ return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
+
+ expand_stack(vma, address - PAGE_SIZE);
+ }
+ if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+ struct vm_area_struct *next = vma->vm_next;
+
+ /* As VM_GROWSDOWN but s/below/above/ */
+ if (next && next->vm_start == address + PAGE_SIZE)
+ return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+ expand_upwards(vma, address + PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,19 +2806,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
+ pte_unmap(page_table);
+
+ /* Check if we need to add a guard page to the stack */
+ if (check_stack_guard_page(vma, address) < 0)
+ return VM_FAULT_SIGBUS;
+
+ /* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* Allocate our own private page. */
- pte_unmap(page_table);
-
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb5..f969da5dd8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, new_nodes)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
+ struct mm_struct *mm = NULL;
struct task_struct *task;
- nodemask_t old;
- nodemask_t new;
nodemask_t task_nodes;
int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ return -ENOMEM;
+
+ old = &scratch->mask1;
+ new = &scratch->mask2;
- err = get_nodes(&old, old_nodes, maxnode);
+ err = get_nodes(old, old_nodes, maxnode);
if (err)
- return err;
+ goto out;
- err = get_nodes(&new, new_nodes, maxnode);
+ err = get_nodes(new, new_nodes, maxnode);
if (err)
- return err;
+ goto out;
/* Find the mm_struct */
read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
read_unlock(&tasklist_lock);
- return -ESRCH;
+ err = -ESRCH;
+ goto out;
}
mm = get_task_mm(task);
read_unlock(&tasklist_lock);
+ err = -EINVAL;
if (!mm)
- return -EINVAL;
+ goto out;
/*
* Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
goto out;
}
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
if (err)
goto out;
- err = do_migrate_pages(mm, &old, &new,
+ err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
- mmput(mm);
+ if (mm)
+ mmput(mm);
+ NODEMASK_SCRATCH_FREE(scratch);
+
return err;
}
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
}
#endif
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+ * allocate from, they may fallback to other nodes when oom.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049..38e7cad782f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* exist when the page is remapped later
*/
anon_vma = page_anon_vma(page);
- atomic_inc(&anon_vma->external_refcount);
+ get_anon_vma(anon_vma);
}
}
@@ -682,12 +682,8 @@ skip_unmap:
rcu_unlock:
/* Drop an anon_vma reference if we took one */
- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
if (rcu_locked)
rcu_read_unlock();
diff --git a/mm/mlock.c b/mm/mlock.c
index 3f82720e051..cbae7c5b956 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,19 @@ void munlock_vma_page(struct page *page)
}
}
+/* Is the vma a continuation of the stack vma above it? */
+static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
+{
+ return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
+}
+
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return (vma->vm_flags & VM_GROWSDOWN) &&
+ (vma->vm_start == addr) &&
+ !vma_stack_continue(vma->vm_prev, addr);
+}
+
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -167,6 +180,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & VM_WRITE)
gup_flags |= FOLL_WRITE;
+ /* We don't try to access the guard page of a stack vma */
+ if (stack_guard_page(vma, start)) {
+ addr += PAGE_SIZE;
+ nr_pages--;
+ }
+
while (nr_pages > 0) {
int i;
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb75..6128dc8e5ed 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -388,17 +388,23 @@ static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
if (prev) {
- vma->vm_next = prev->vm_next;
+ next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
- vma->vm_next = NULL;
+ next = NULL;
}
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -452,12 +458,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
- anon_vma_unlock(vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -485,7 +489,11 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ struct vm_area_struct *next = vma->vm_next;
+
+ prev->vm_next = next;
+ if (next)
+ next->vm_prev = prev;
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -506,6 +514,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
+ struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
long adjust_next = 0;
int remove_next = 0;
@@ -578,6 +587,17 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ /*
+ * When changing only vma->vm_end, we don't really need anon_vma
+ * lock. This is a fairly rare case by itself, but the anon_vma
+ * lock may be shared between many sibling processes. Skipping
+ * the lock for brk adjustments makes a difference sometimes.
+ */
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock(anon_vma);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -617,6 +637,8 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}
+ if (anon_vma)
+ anon_vma_unlock(anon_vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -1694,9 +1716,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifndef CONFIG_IA64
-static
-#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1710,7 +1729,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1740,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
@@ -1739,7 +1758,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1805,7 @@ static int expand_downwards(struct vm_area_struct *vma,
perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
@@ -1903,6 +1922,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
@@ -1910,6 +1930,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
+ if (vma)
+ vma->vm_prev = prev;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2470,23 +2492,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->lock. If some other vma in this mm shares
+ * anon_vma->root->lock. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
}
}
@@ -2577,7 +2599,7 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
@@ -2588,12 +2610,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
}
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee0abe..88ff091eb07 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,11 +36,6 @@
#include <asm/mmu_context.h>
#include "internal.h"
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
-
#if 0
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -609,7 +604,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp;
+ struct vm_area_struct *pvma, **pp, *next;
struct address_space *mapping;
struct rb_node **p, *parent;
@@ -669,8 +664,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
break;
}
- vma->vm_next = *pp;
+ next = *pp;
*pp = vma;
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa01..fc81cb22869 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
#include <linux/security.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks;
+int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
+
+#ifdef CONFIG_NUMA
+/**
+ * has_intersects_mems_allowed() - check task eligiblity for kill
+ * @tsk: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
+ */
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct task_struct *start = tsk;
+
+ do {
+ if (mask) {
+ /*
+ * If this is a mempolicy constrained oom, tsk's
+ * cpuset is irrelevant. Only return true if its
+ * mempolicy intersects current, otherwise it may be
+ * needlessly killed.
+ */
+ if (mempolicy_nodemask_intersects(tsk, mask))
+ return true;
+ } else {
+ /*
+ * This is not a mempolicy constrained oom, so only
+ * check the mems of tsk's cpuset.
+ */
+ if (cpuset_mems_allowed_intersects(current, tsk))
+ return true;
+ }
+ } while_each_thread(start, tsk);
+
+ return false;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
/*
- * Is all threads of the target process nodes overlap ours?
+ * If this is a system OOM (not a memcg OOM) and the task selected to be
+ * killed is not already running at high (RT) priorities, speed up the
+ * recovery by boosting the dying task to the lowest FIFO priority.
+ * That helps with the recovery and avoids interfering with RT tasks.
*/
-static int has_intersects_mems_allowed(struct task_struct *tsk)
+static void boost_dying_task_prio(struct task_struct *p,
+ struct mem_cgroup *mem)
{
- struct task_struct *t;
+ struct sched_param param = { .sched_priority = 1 };
+
+ if (mem)
+ return;
+
+ if (!rt_task(p))
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+}
+
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer. Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
+struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+ struct task_struct *t = p;
- t = tsk;
do {
- if (cpuset_mems_allowed_intersects(current, t))
- return 1;
- t = next_thread(t);
- } while (t != tsk);
+ task_lock(t);
+ if (likely(t->mm))
+ return t;
+ task_unlock(t);
+ } while_each_thread(p, t);
- return 0;
+ return NULL;
+}
+
+/* return true if the task is not adequate as candidate victim task. */
+static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
+{
+ if (is_global_init(p))
+ return true;
+ if (p->flags & PF_KTHREAD)
+ return true;
+
+ /* When mem_cgroup_out_of_memory() and p is not member of the group */
+ if (mem && !task_in_mem_cgroup(p, mem))
+ return true;
+
+ /* p may not have freeable memory in nodemask */
+ if (!has_intersects_mems_allowed(p, nodemask))
+ return true;
+
+ return false;
}
/**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
- *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
+ * @totalpages: total present RAM allowed for page allocation
*
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- * algorithm has been meticulously tuned to meet the principle
- * of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
*/
-
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+ const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points, cpu_time, run_time;
- struct mm_struct *mm;
- struct task_struct *child;
- int oom_adj = p->signal->oom_adj;
- struct task_cputime task_time;
- unsigned long utime;
- unsigned long stime;
+ int points;
- if (oom_adj == OOM_DISABLE)
+ if (oom_unkillable_task(p, mem, nodemask))
return 0;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
- task_unlock(p);
+ p = find_lock_task_mm(p);
+ if (!p)
return 0;
- }
-
- /*
- * The memory size of the process is the basis for the badness.
- */
- points = mm->total_vm;
/*
- * After this unlock we can no longer dereference local variable `mm'
+ * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
+ * need to be executed for something that cannot be killed.
*/
- task_unlock(p);
-
- /*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_OOM_ORIGIN)
- return ULONG_MAX;
-
- /*
- * Processes which fork a lot of child processes are likely
- * a good choice. We add half the vmsize of the children if they
- * have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children. In case a single
- * child is eating the vast majority of memory, adding only half
- * to the parents will make the child our kill candidate of choice.
- */
- list_for_each_entry(child, &p->children, sibling) {
- task_lock(child);
- if (child->mm != mm && child->mm)
- points += child->mm->total_vm/2 + 1;
- task_unlock(child);
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ task_unlock(p);
+ return 0;
}
/*
- * CPU time is in tens of seconds and run time is in thousands
- * of seconds. There is no particular reason for this other than
- * that it turned out to work very well in practice.
- */
- thread_group_cputime(p, &task_time);
- utime = cputime_to_jiffies(task_time.utime);
- stime = cputime_to_jiffies(task_time.stime);
- cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
- if (uptime >= p->start_time.tv_sec)
- run_time = (uptime - p->start_time.tv_sec) >> 10;
- else
- run_time = 0;
-
- if (cpu_time)
- points /= int_sqrt(cpu_time);
- if (run_time)
- points /= int_sqrt(int_sqrt(run_time));
-
- /*
- * Niced processes are most likely less important, so double
- * their badness points.
+ * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+ * priority for oom killing.
*/
- if (task_nice(p) > 0)
- points *= 2;
+ if (p->flags & PF_OOM_ORIGIN) {
+ task_unlock(p);
+ return 1000;
+ }
/*
- * Superuser processes are usually more important, so we make it
- * less likely that we kill those.
+ * The memory controller may have a limit of 0 bytes, so avoid a divide
+ * by zero, if necessary.
*/
- if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
- has_capability_noaudit(p, CAP_SYS_RESOURCE))
- points /= 4;
+ if (!totalpages)
+ totalpages = 1;
/*
- * We don't want to kill a process with direct hardware access.
- * Not only could that mess up the hardware, but usually users
- * tend to only have this flag set on applications they think
- * of as important.
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss and swap space use.
*/
- if (has_capability_noaudit(p, CAP_SYS_RAWIO))
- points /= 4;
+ points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+ totalpages;
+ task_unlock(p);
/*
- * If p's nodes don't overlap ours, it may still help to kill p
- * because p may have allocated or otherwise mapped memory on
- * this node before. However it will be less likely.
+ * Root processes get 3% bonus, just like the __vm_enough_memory()
+ * implementation used by LSMs.
*/
- if (!has_intersects_mems_allowed(p))
- points /= 8;
+ if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+ points -= 30;
/*
- * Adjust the score by oom_adj.
+ * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+ * either completely disable oom killing or always prefer a certain
+ * task.
*/
- if (oom_adj) {
- if (oom_adj > 0) {
- if (!points)
- points = 1;
- points <<= oom_adj;
- } else
- points >>= -(oom_adj);
- }
+ points += p->signal->oom_score_adj;
-#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
- p->pid, p->comm, points);
-#endif
- return points;
+ if (points < 0)
+ return 0;
+ return (points < 1000) ? points : 1000;
}
/*
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
*/
#ifdef CONFIG_NUMA
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ bool cpuset_limited = false;
+ int nid;
+
+ /* Default to all available memory */
+ *totalpages = totalram_pages + total_swap_pages;
+ if (!zonelist)
+ return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
/*
- * The nodemask here is a nodemask passed to alloc_pages(). Now,
- * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
- * feature. mempolicy is an only user of nodemask here.
- * check mempolicy's nodemask contains all N_HIGH_MEMORY
+ * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+ * the page allocator means a mempolicy is in effect. Cpuset policy
+ * is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+ if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, *nodemask)
+ *totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
+ }
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask)
if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
- return CONSTRAINT_CPUSET;
+ cpuset_limited = true;
+ if (cpuset_limited) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, cpuset_current_mems_allowed)
+ *totalpages += node_spanned_pages(nid);
+ return CONSTRAINT_CPUSET;
+ }
return CONSTRAINT_NONE;
}
#else
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask)
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
{
+ *totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints,
- struct mem_cgroup *mem)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+ unsigned long totalpages, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *chosen = NULL;
- struct timespec uptime;
*ppoints = 0;
- do_posix_clock_monotonic_gettime(&uptime);
for_each_process(p) {
- unsigned long points;
+ unsigned int points;
- /*
- * skip kernel threads and tasks which have already released
- * their mm.
- */
- if (!p->mm)
- continue;
- /* skip the init task */
- if (is_global_init(p))
- continue;
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
/*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
* the process of exiting and releasing its resources.
* Otherwise we could get an easy OOM deadlock.
*/
- if (p->flags & PF_EXITING) {
+ if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
if (p != current)
return ERR_PTR(-1UL);
chosen = p;
- *ppoints = ULONG_MAX;
+ *ppoints = 1000;
}
- if (p->signal->oom_adj == OOM_DISABLE)
- continue;
-
- points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
+ points = oom_badness(p, mem, nodemask, totalpages);
+ if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: target memory controller
+ * @mem: current's memory controller, if constrained
*
* Dumps the current memory state of all system tasks, excluding kernel threads.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
+ * value, oom_score_adj value, and name.
*
* If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
* shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*/
static void dump_tasks(const struct mem_cgroup *mem)
{
- struct task_struct *g, *p;
-
- printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
- "name\n");
- do_each_thread(g, p) {
- struct mm_struct *mm;
+ struct task_struct *p;
+ struct task_struct *task;
- if (mem && !task_in_mem_cgroup(p, mem))
+ pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
+ for_each_process(p) {
+ if (p->flags & PF_KTHREAD)
continue;
- if (!thread_group_leader(p))
+ if (mem && !task_in_mem_cgroup(p, mem))
continue;
- task_lock(p);
- mm = p->mm;
- if (!mm) {
+ task = find_lock_task_mm(p);
+ if (!task) {
/*
- * total_vm and rss sizes do not exist for tasks with no
- * mm so there's no need to report them; they can't be
- * oom killed anyway.
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
*/
- task_unlock(p);
continue;
}
- printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
- p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
- p->comm);
- task_unlock(p);
- } while_each_thread(g, p);
+
+ pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
+ task->pid, task_uid(task), task->tgid,
+ task->mm->total_vm, get_mm_rss(task->mm),
+ task_cpu(task), task->signal->oom_adj,
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+ }
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
struct mem_cgroup *mem)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_adj=%d\n",
- current->comm, gfp_mask, order, current->signal->oom_adj);
task_lock(current);
+ pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+ "oom_adj=%d, oom_score_adj=%d\n",
+ current->comm, gfp_mask, order, current->signal->oom_adj,
+ current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -374,72 +398,42 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-
-/*
- * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
- * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
- * set.
- */
-static void __oom_kill_task(struct task_struct *p, int verbose)
+static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
- if (is_global_init(p)) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill init!\n");
- return;
- }
-
- task_lock(p);
- if (!p->mm) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- return;
- }
+ p = find_lock_task_mm(p);
+ if (!p)
+ return 1;
- if (verbose)
- printk(KERN_ERR "Killed process %d (%s) "
- "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
- task_pid_nr(p), p->comm,
- K(p->mm->total_vm),
- K(get_mm_counter(p->mm, MM_ANONPAGES)),
- K(get_mm_counter(p->mm, MM_FILEPAGES)));
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(p), p->comm, K(p->mm->total_vm),
+ K(get_mm_counter(p->mm, MM_ANONPAGES)),
+ K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
+
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ force_sig(SIGKILL, p);
+
/*
* We give our sacrificial lamb high priority and access to
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
- p->rt.time_slice = HZ;
- set_tsk_thread_flag(p, TIF_MEMDIE);
-
- force_sig(SIGKILL, p);
-}
-
-static int oom_kill_task(struct task_struct *p)
-{
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
- *
- * Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
- * However, this is of no concern to us.
- */
- if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
- return 1;
-
- __oom_kill_task(p, 1);
+ boost_dying_task_prio(p, mem);
return 0;
}
+#undef K
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned long points, struct mem_cgroup *mem,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *mem, nodemask_t *nodemask,
const char *message)
{
- struct task_struct *c;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t = p;
+ unsigned int victim_points = 0;
if (printk_ratelimit())
dump_header(p, gfp_mask, order, mem);
@@ -449,40 +443,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
- __oom_kill_task(p, 0);
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ boost_dying_task_prio(p, mem);
return 0;
}
- printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
- message, task_pid_nr(p), p->comm, points);
+ task_lock(p);
+ pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+ task_unlock(p);
- /* Try to kill a child first */
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == p->mm)
- continue;
- if (mem && !task_in_mem_cgroup(c, mem))
- continue;
- if (!oom_kill_task(c))
- return 0;
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ do {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child, mem, nodemask,
+ totalpages);
+ if (child_points > victim_points) {
+ victim = child;
+ victim_points = child_points;
+ }
+ }
+ } while_each_thread(p, t);
+
+ return oom_kill_task(victim, mem);
+}
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+ int order)
+{
+ if (likely(!sysctl_panic_on_oom))
+ return;
+ if (sysctl_panic_on_oom != 2) {
+ /*
+ * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+ * does not panic for cpuset, mempolicy, or memcg allocation
+ * failures.
+ */
+ if (constraint != CONSTRAINT_NONE)
+ return;
}
- return oom_kill_task(p);
+ read_lock(&tasklist_lock);
+ dump_header(NULL, gfp_mask, order, NULL);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory: %s panic_on_oom is enabled\n",
+ sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
- unsigned long points = 0;
+ unsigned long limit;
+ unsigned int points = 0;
struct task_struct *p;
- if (sysctl_panic_on_oom == 2)
- panic("out of memory(memcg). panic_on_oom is selected.\n");
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+ limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, mem);
+ p = select_bad_process(&points, limit, mem, NULL);
if (!p || PTR_ERR(p) == -1UL)
goto out;
- if (oom_kill_process(p, gfp_mask, 0, points, mem,
+ if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
"Memory cgroup out of memory"))
goto retry;
out:
@@ -509,7 +544,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
* if a parallel OOM killing is already taking place that includes a zone in
* the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
*/
-int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
struct zoneref *z;
struct zone *zone;
@@ -526,7 +561,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
/*
* Lock each zone in the zonelist under zone_scan_lock so a
- * parallel invocation of try_set_zone_oom() doesn't succeed
+ * parallel invocation of try_set_zonelist_oom() doesn't succeed
* when it shouldn't.
*/
zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +590,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
}
/*
- * Must be called with tasklist_lock held for read.
+ * Try to acquire the oom killer lock for all system zones. Returns zero if a
+ * parallel oom killing is taking place, otherwise locks all zones and returns
+ * non-zero.
*/
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static int try_set_system_oom(void)
{
- struct task_struct *p;
- unsigned long points;
-
- if (sysctl_oom_kill_allocating_task)
- if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
- "Out of memory (oom_kill_allocating_task)"))
- return;
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points, NULL);
-
- if (PTR_ERR(p) == -1UL)
- return;
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- dump_header(NULL, gfp_mask, order, NULL);
- panic("Out of memory and no killable processes...\n");
- }
+ struct zone *zone;
+ int ret = 1;
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
- goto retry;
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ if (zone_is_oom_locked(zone)) {
+ ret = 0;
+ goto out;
+ }
+ for_each_populated_zone(zone)
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+out:
+ spin_unlock(&zone_scan_lock);
+ return ret;
}
/*
- * pagefault handler calls into here because it is out of memory but
- * doesn't know exactly how or why.
+ * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
+ * attempts or page faults may now recall the oom killer, if necessary.
*/
-void pagefault_out_of_memory(void)
+static void clear_system_oom(void)
{
- unsigned long freed = 0;
-
- blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
- /* Got some memory back in the last second. */
- return;
-
- if (sysctl_panic_on_oom)
- panic("out of memory from page fault. panic_on_oom is selected.\n");
-
- read_lock(&tasklist_lock);
- __out_of_memory(0, 0); /* unknown gfp_mask and order */
- read_unlock(&tasklist_lock);
+ struct zone *zone;
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory.
- */
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ spin_lock(&zone_scan_lock);
+ for_each_populated_zone(zone)
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ spin_unlock(&zone_scan_lock);
}
/**
@@ -621,6 +631,7 @@ void pagefault_out_of_memory(void)
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
@@ -630,49 +641,93 @@ void pagefault_out_of_memory(void)
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
+ struct task_struct *p;
+ unsigned long totalpages;
unsigned long freed = 0;
- enum oom_constraint constraint;
+ unsigned int points;
+ enum oom_constraint constraint = CONSTRAINT_NONE;
+ int killed = 0;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
- if (sysctl_panic_on_oom == 2) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ boost_dying_task_prio(current, NULL);
+ return;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+ &totalpages);
+ check_panic_on_oom(constraint, gfp_mask, order);
+
read_lock(&tasklist_lock);
+ if (sysctl_oom_kill_allocating_task &&
+ !oom_unkillable_task(current, NULL, nodemask) &&
+ (current->signal->oom_adj != OOM_DISABLE)) {
+ /*
+ * oom_kill_process() needs tasklist_lock held. If it returns
+ * non-zero, current could not be killed so we must fallback to
+ * the tasklist scan.
+ */
+ if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+ NULL, nodemask,
+ "Out of memory (oom_kill_allocating_task)"))
+ goto out;
+ }
- switch (constraint) {
- case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, 0, NULL,
- "No available memory (MPOL_BIND)");
- break;
+retry:
+ p = select_bad_process(&points, totalpages, NULL,
+ constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
+ NULL);
+ if (PTR_ERR(p) == -1UL)
+ goto out;
- case CONSTRAINT_NONE:
- if (sysctl_panic_on_oom) {
- dump_header(NULL, gfp_mask, order, NULL);
- panic("out of memory. panic_on_oom is selected\n");
- }
- /* Fall-through */
- case CONSTRAINT_CPUSET:
- __out_of_memory(gfp_mask, order);
- break;
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ dump_header(NULL, gfp_mask, order, NULL);
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
}
+ if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+ nodemask, "Out of memory"))
+ goto retry;
+ killed = 1;
+out:
read_unlock(&tasklist_lock);
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
*/
+ if (killed && !test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
+ * oom killing is already in progress so do nothing. If a task is found with
+ * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ */
+void pagefault_out_of_memory(void)
+{
+ if (try_set_system_oom()) {
+ out_of_memory(NULL, 0, 0, NULL);
+ clear_system_oom();
+ }
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef6154..e3bccac1f02 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <trace/events/writeback.h>
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
}
}
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty, unsigned long *pbdi_dirty)
-{
- unsigned long avail_dirty;
-
- avail_dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP);
-
- if (avail_dirty < dirty)
- avail_dirty = dirty - avail_dirty;
- else
- avail_dirty = 0;
-
- avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
- bdi_stat(bdi, BDI_WRITEBACK);
-
- *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
static inline void task_dirties_fraction(struct task_struct *tsk,
long *numerator, long *denominator)
{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
}
/*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
*
* task specific dirty limit:
*
* dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
*/
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+ unsigned long bdi_dirty)
{
long numerator, denominator;
- unsigned long dirty = *pdirty;
+ unsigned long dirty = bdi_dirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
do_div(inv, denominator);
dirty -= inv;
- if (dirty < *pdirty/2)
- dirty = *pdirty/2;
- *pdirty = dirty;
+ return max(dirty, bdi_dirty/2);
}
/*
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
- unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * runtime tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
@@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+}
- if (bdi) {
- u64 bdi_dirty;
- long numerator, denominator;
+/*
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
- /*
- * Calculate this BDI's share of the dirty ratio.
- */
- bdi_writeout_fraction(bdi, &numerator, &denominator);
-
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
-
- *pbdi_dirty = bdi_dirty;
- clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
- }
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
}
/*
@@ -490,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
-
+ bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
@@ -501,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_writeback = global_page_state(NR_WRITEBACK);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Throttle it only when the background writeback cannot
@@ -523,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
(background_thresh + dirty_thresh) / 2)
break;
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- }
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -555,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping,
if (bdi_thresh < 2*bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ /*
+ * The bdi thresh is somehow "soft" limit derived from the
+ * global "hard" limit. The former helps to prevent heavy IO
+ * bdi or process from holding back light ones; The latter is
+ * the last resort safeguard.
+ */
+ dirty_exceeded =
+ (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+ || (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+ if (!dirty_exceeded)
break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
+ */
+ trace_wbc_balance_dirty_start(&wbc, bdi);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wb(&bdi->wb, &wbc);
+ pages_written += write_chunk - wbc.nr_to_write;
+ trace_wbc_balance_dirty_written(&wbc, bdi);
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+ }
+ trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_INTERRUPTIBLE);
io_schedule_timeout(pause);
@@ -577,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -593,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
bdi_start_background_writeback(bdi);
}
@@ -659,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -805,6 +806,42 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+ unsigned long tagged;
+
+ do {
+ spin_lock_irq(&mapping->tree_lock);
+ tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+ &start, end, WRITEBACK_TAG_BATCH,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+ cond_resched();
+ /* We check 'start' to handle wrapping when end == ~0UL */
+ } while (tagged >= WRITEBACK_TAG_BATCH && start);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +855,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +877,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -849,29 +894,19 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
-
- /*
- * If this is a data integrity sync, cap the writeback to the
- * current end of file. Any extension to the file that occurs
- * after this is a new write and we don't need to write those
- * pages out to fulfil our data integrity requirements. If we
- * try to write them out, we can get stuck in this scan until
- * the concurrent writer stops adding dirty pages and extending
- * EOF.
- */
- if (wbc->sync_mode == WB_SYNC_ALL &&
- wbc->range_end == LLONG_MAX) {
- end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
- }
}
-
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
@@ -929,6 +964,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -949,22 +985,16 @@ continue_unlock:
}
}
- if (wbc->nr_to_write > 0) {
- if (--wbc->nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
}
pagevec_release(&pvec);
@@ -1096,6 +1126,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_CACHE_SIZE);
}
}
+EXPORT_SYMBOL(account_page_dirtied);
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
@@ -1327,6 +1358,9 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c..a9649f4b261 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct page *page;
/* Acquire the OOM killer lock for the zones in zonelist */
- if (!try_set_zone_oom(zonelist, gfp_mask)) {
+ if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2052,15 +2055,23 @@ rebalance:
if (page)
goto got_pg;
- /*
- * The OOM killer does not trigger for high-order
- * ~__GFP_NOFAIL allocations so if no progress is being
- * made, there are no other options and retrying is
- * unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /*
+ * The oom killer is not called for high-order
+ * allocations that may fail, so if no progress
+ * is being made, there are no other options and
+ * retrying is unlikely to help.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto nopage;
+ /*
+ * The oom killer is not called for lowmem
+ * allocations to prevent needlessly killing
+ * innocent tasks.
+ */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto nopage;
+ }
goto restart;
}
@@ -4089,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone->prev_priority = DEF_PRIORITY;
-
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230..2dee975bf46 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+ rw |= REQ_SYNC | REQ_UNPLUG;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea..f6f0d2dda2e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
+ /*
+ * This VMA had no anon_vma yet. This anon_vma is
+ * the root of any anon_vma tree that might form.
+ */
+ anon_vma->root = anon_vma;
}
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
avc->anon_vma = anon_vma;
avc->vma = vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- list_add(&avc->same_anon_vma, &anon_vma->head);
+ list_add_tail(&avc->same_anon_vma, &anon_vma->head);
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (unlikely(allocated))
anon_vma_free(allocated);
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
/*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
avc = anon_vma_chain_alloc();
if (!avc)
goto out_error_free_anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+
+ /*
+ * The root anon_vma's spinlock is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ /*
+ * With KSM refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned
+ * until this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
return 0;
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
if (!anon_vma)
return;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_del(&anon_vma_chain->same_anon_vma);
/* We must garbage collect the anon_vma if it's empty */
empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
- if (empty)
+ if (empty) {
+ /* We no longer need the root anon_vma */
+ if (anon_vma->root != anon_vma)
+ drop_anon_vma(anon_vma->root);
anon_vma_free(anon_vma);
+ }
}
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
- /* Unlink each anon_vma chained to the VMA. */
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
anon_vma_unlink(avc);
list_del(&avc->same_vma);
@@ -291,7 +316,7 @@ void __init anon_vma_init(void)
*/
struct anon_vma *page_lock_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma;
+ struct anon_vma *anon_vma, *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
@@ -302,8 +327,21 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
- return anon_vma;
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ spin_lock(&root_anon_vma->lock);
+
+ /*
+ * If this page is still mapped, then its anon_vma cannot have been
+ * freed. But if it has been unmapped, we have no security against
+ * the anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
+ * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
+ * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+ */
+ if (page_mapped(page))
+ return anon_vma;
+
+ spin_unlock(&root_anon_vma->lock);
out:
rcu_read_unlock();
return NULL;
@@ -311,7 +349,7 @@ out:
void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
rcu_read_unlock();
}
@@ -326,6 +364,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long address;
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ pgoff = page->index << huge_page_order(page_hstate(page));
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
/* page should be within @vma mapping range */
@@ -340,9 +380,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- if (PageAnon(page))
- ;
- else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+ if (PageAnon(page)) {
+ if (vma->anon_vma->root != page_anon_vma(page)->root)
+ return -EFAULT;
+ } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
@@ -369,6 +410,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte_t *pte;
spinlock_t *ptl;
+ if (unlikely(PageHuge(page))) {
+ pte = huge_pte_offset(mm, address);
+ ptl = &mm->page_table_lock;
+ goto check;
+ }
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
@@ -389,6 +436,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
}
ptl = pte_lockptr(mm, pmd);
+check:
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -743,14 +791,20 @@ static void __page_set_anon_rmap(struct page *page,
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
- *
- * So take the last AVC chain entry in the vma, which is
- * the deepest ancestor, and use the anon_vma from that.
*/
if (!exclusive) {
- struct anon_vma_chain *avc;
- avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
- anon_vma = avc->anon_vma;
+ if (PageAnon(page))
+ return;
+ anon_vma = anon_vma->root;
+ } else {
+ /*
+ * In this case, swapped-out-but-not-discarded swap-cache
+ * is remapped. So, no need to update page->mapping here.
+ * We convice anon_vma poitned by page->mapping is not obsolete
+ * because vma->anon_vma is necessary to be a family of it.
+ */
+ if (PageAnon(page))
+ return;
}
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +834,7 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
+ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
BUG_ON(page->index != linear_page_index(vma, address));
#endif
}
@@ -798,6 +853,17 @@ static void __page_check_anon_rmap(struct page *page,
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
int first = atomic_inc_and_test(&page->_mapcount);
if (first)
__inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +873,7 @@ void page_add_anon_rmap(struct page *page,
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (first)
- __page_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address, exclusive);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -873,6 +939,12 @@ void page_remove_rmap(struct page *page)
page_clear_dirty(page);
set_page_dirty(page);
}
+ /*
+ * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+ * and not charged by memcg for now.
+ */
+ if (unlikely(PageHuge(page)))
+ return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1440,42 @@ int try_to_munlock(struct page *page)
return try_to_unmap_file(page, TTU_MUNLOCK);
}
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary. Be careful to do all the tests under the lock. Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+ BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
+ if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+ struct anon_vma *root = anon_vma->root;
+ int empty = list_empty(&anon_vma->head);
+ int last_root_user = 0;
+ int root_empty = 0;
+
+ /*
+ * The refcount on a non-root anon_vma got dropped. Drop
+ * the refcount on the root and check if we need to free it.
+ */
+ if (empty && anon_vma != root) {
+ BUG_ON(atomic_read(&root->external_refcount) <= 0);
+ last_root_user = atomic_dec_and_test(&root->external_refcount);
+ root_empty = list_empty(&root->head);
+ }
+ anon_vma_unlock(anon_vma);
+
+ if (empty) {
+ anon_vma_free(anon_vma);
+ if (root_empty && last_root_user)
+ anon_vma_free(root);
+ }
+ }
+}
+#endif
+
#ifdef CONFIG_MIGRATION
/*
* rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1497,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1399,7 +1507,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
return ret;
}
@@ -1445,3 +1553,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ BUG_ON(!anon_vma);
+ if (!exclusive) {
+ struct anon_vma_chain *avc;
+ avc = list_entry(vma->anon_vma_chain.prev,
+ struct anon_vma_chain, same_vma);
+ anon_vma = avc->anon_vma;
+ }
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+ BUG_ON(!anon_vma);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(&page->_mapcount, 0);
+ __hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db..080b09a57a8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/percpu_counter.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += pages;
+ percpu_counter_add(&sbinfo->used_blocks, -pages);
+ spin_lock(&inode->i_lock);
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
}
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
if (sgp == SGP_READ)
return shmem_swp_map(ZERO_PAGE(0));
/*
- * Test free_blocks against 1 not 0, since we have 1 data
+ * Test used_blocks against 1 less max_blocks, since we have 1 data
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks <= 1) {
- spin_unlock(&sbinfo->stat_lock);
+ if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
return ERR_PTR(-ENOSPC);
- }
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&info->lock);
@@ -767,6 +766,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
loff_t newsize = attr->ia_size;
int error;
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
&& newsize != inode->i_size) {
struct page *page = NULL;
@@ -801,25 +804,22 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
}
}
- error = simple_setsize(inode, newsize);
+ /* XXX(truncate): truncate_setsize should be called last */
+ truncate_setsize(inode, newsize);
if (page)
page_cache_release(page);
- if (error)
- return error;
shmem_truncate_range(inode, newsize, (loff_t)-1);
}
- error = inode_change_ok(inode, attr);
- if (!error)
- generic_setattr(inode, attr);
+ setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
- if (!error && (attr->ia_valid & ATTR_MODE))
+ if (attr->ia_valid & ATTR_MODE)
error = generic_acl_chmod(inode);
#endif
return error;
}
-static void shmem_delete_inode(struct inode *inode)
+static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -836,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
}
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
- clear_inode(inode);
+ end_writeback(inode);
}
static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -933,7 +933,7 @@ found:
/*
* Move _head_ to start search for next from here.
- * But be careful: shmem_delete_inode checks list_empty without taking
+ * But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
* would appear empty, if it were the only one on shmem_swaplist. We
* could avoid doing it if inode NULL; or use this minor optimization.
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
struct shmem_sb_info *sbinfo;
struct page *filepage = *pagep;
struct page *swappage;
+ struct page *prealloc_page = NULL;
swp_entry_t *entry;
swp_entry_t swap;
gfp_t gfp;
@@ -1247,7 +1248,6 @@ repeat:
filepage = find_lock_page(mapping, idx);
if (filepage && PageUptodate(filepage))
goto done;
- error = 0;
gfp = mapping_gfp_mask(mapping);
if (!filepage) {
/*
@@ -1258,7 +1258,19 @@ repeat:
if (error)
goto failed;
radix_tree_preload_end();
+ if (sgp != SGP_READ && !prealloc_page) {
+ /* We don't care if this fails */
+ prealloc_page = shmem_alloc_page(gfp, info, idx);
+ if (prealloc_page) {
+ if (mem_cgroup_cache_charge(prealloc_page,
+ current->mm, GFP_KERNEL)) {
+ page_cache_release(prealloc_page);
+ prealloc_page = NULL;
+ }
+ }
+ }
}
+ error = 0;
spin_lock(&info->lock);
shmem_recalc_inode(inode);
@@ -1387,17 +1399,16 @@ repeat:
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
+ if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
spin_unlock(&info->lock);
error = -ENOSPC;
goto failed;
}
- sbinfo->free_blocks--;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
+ spin_unlock(&inode->i_lock);
} else if (shmem_acct_block(info->flags)) {
spin_unlock(&info->lock);
error = -ENOSPC;
@@ -1407,28 +1418,38 @@ repeat:
if (!filepage) {
int ret;
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
+ if (!prealloc_page) {
+ spin_unlock(&info->lock);
+ filepage = shmem_alloc_page(gfp, info, idx);
+ if (!filepage) {
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ error = -ENOMEM;
+ goto failed;
+ }
+ SetPageSwapBacked(filepage);
- /* Precharge page while we can wait, compensate after */
- error = mem_cgroup_cache_charge(filepage, current->mm,
- GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- goto failed;
+ /*
+ * Precharge page while we can wait, compensate
+ * after
+ */
+ error = mem_cgroup_cache_charge(filepage,
+ current->mm, GFP_KERNEL);
+ if (error) {
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ goto failed;
+ }
+
+ spin_lock(&info->lock);
+ } else {
+ filepage = prealloc_page;
+ prealloc_page = NULL;
+ SetPageSwapBacked(filepage);
}
- spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp);
if (IS_ERR(entry))
error = PTR_ERR(entry);
@@ -1469,13 +1490,19 @@ repeat:
}
done:
*pagep = filepage;
- return 0;
+ error = 0;
+ goto out;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
+out:
+ if (prealloc_page) {
+ mem_cgroup_uncharge_cache_page(prealloc_page);
+ page_cache_release(prealloc_page);
+ }
return error;
}
@@ -1791,17 +1818,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ buf->f_bavail = buf->f_bfree =
+ sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
- spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -2242,7 +2268,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
struct shmem_sb_info config = *sbinfo;
- unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;
@@ -2250,9 +2275,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
return error;
spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (config.max_blocks < blocks)
+ if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
goto out;
if (config.max_inodes < inodes)
goto out;
@@ -2269,7 +2293,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
error = 0;
sbinfo->max_blocks = config.max_blocks;
- sbinfo->free_blocks = config.max_blocks - blocks;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2302,7 +2325,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
static void shmem_put_super(struct super_block *sb)
{
- kfree(sb->s_fs_info);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ percpu_counter_destroy(&sbinfo->used_blocks);
+ kfree(sbinfo);
sb->s_fs_info = NULL;
}
@@ -2344,7 +2370,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->free_blocks = sbinfo->max_blocks;
+ if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2496,7 +2523,7 @@ static const struct super_operations shmem_ops = {
.remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
- .delete_inode = shmem_delete_inode,
+ .evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
};
diff --git a/mm/slab.c b/mm/slab.c
index 736e497733d..fcae9815d3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -2330,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a..1f3f9c59a73 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
long total_swap_pages;
static int least_priority;
+static bool swap_for_hibernation;
+
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
@@ -318,8 +320,10 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
- /* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ /* reuse swap entry of cache-only swap if not hibernation. */
+ if (vm_swap_full()
+ && usage == SWAP_HAS_CACHE
+ && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -449,6 +453,8 @@ swp_entry_t get_swap_page(void)
spin_lock(&swap_lock);
if (nr_swap_pages <= 0)
goto noswap;
+ if (swap_for_hibernation)
+ goto noswap;
nr_swap_pages--;
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -481,28 +487,6 @@ noswap:
return (swp_entry_t) {0};
}
-/* The only caller of this function is now susupend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
- struct swap_info_struct *si;
- pgoff_t offset;
-
- spin_lock(&swap_lock);
- si = swap_info[type];
- if (si && (si->flags & SWP_WRITEOK)) {
- nr_swap_pages--;
- /* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, 1);
- if (offset) {
- spin_unlock(&swap_lock);
- return swp_entry(type, offset);
- }
- nr_swap_pages++;
- }
- spin_unlock(&swap_lock);
- return (swp_entry_t) {0};
-}
-
static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -762,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
#endif
#ifdef CONFIG_HIBERNATION
+
+static pgoff_t hibernation_offset[MAX_SWAPFILES];
+/*
+ * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
+ * saved swap_map[] image to the disk will be an incomplete because it's
+ * changing without synchronization with hibernation snap shot.
+ * At resume, we just make swap_for_hibernation=false. We can forget
+ * used maps easily.
+ */
+void hibernation_freeze_swap(void)
+{
+ int i;
+
+ spin_lock(&swap_lock);
+
+ printk(KERN_INFO "PM: Freeze Swap\n");
+ swap_for_hibernation = true;
+ for (i = 0; i < MAX_SWAPFILES; i++)
+ hibernation_offset[i] = 1;
+ spin_unlock(&swap_lock);
+}
+
+void hibernation_thaw_swap(void)
+{
+ spin_lock(&swap_lock);
+ if (swap_for_hibernation) {
+ printk(KERN_INFO "PM: Thaw Swap\n");
+ swap_for_hibernation = false;
+ }
+ spin_unlock(&swap_lock);
+}
+
+/*
+ * Because updateing swap_map[] can make not-saved-status-change,
+ * we use our own easy allocator.
+ * Please see kernel/power/swap.c, Used swaps are recorded into
+ * RB-tree.
+ */
+swp_entry_t get_swap_for_hibernation(int type)
+{
+ pgoff_t off;
+ swp_entry_t val = {0};
+ struct swap_info_struct *si;
+
+ spin_lock(&swap_lock);
+
+ si = swap_info[type];
+ if (!si || !(si->flags & SWP_WRITEOK))
+ goto done;
+
+ for (off = hibernation_offset[type]; off < si->max; ++off) {
+ if (!si->swap_map[off])
+ break;
+ }
+ if (off < si->max) {
+ val = swp_entry(type, off);
+ hibernation_offset[type] = off + 1;
+ }
+done:
+ spin_unlock(&swap_lock);
+ return val;
+}
+
+void swap_free_for_hibernation(swp_entry_t ent)
+{
+ /* Nothing to do */
+}
+
/*
* Find the swap type that corresponds to given device (if any).
*
diff --git a/mm/truncate.c b/mm/truncate.c
index 937571b8b23..ba887bff48c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
EXPORT_SYMBOL(truncate_pagecache);
/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updastes i_size update and performs pagecache
+ * truncation (if necessary) for a file size updates. It will be
+ * typically be called from the filesystem's setattr function when
+ * ATTR_SIZE is passed in.
+ *
+ * Must be called with inode_mutex held and after all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize;
+
+ oldsize = inode->i_size;
+ i_size_write(inode, newsize);
+
+ truncate_pagecache(inode, oldsize, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
* @offset: file offset to start truncating
*
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- *
- * This function is deprecated and simple_setsize or truncate_pagecache
- * should be used instead.
+ * This function is deprecated and truncate_setsize or truncate_pagecache
+ * should be used instead, together with filesystem specific block truncation.
*/
int vmtruncate(struct inode *inode, loff_t offset)
{
int error;
- error = simple_setsize(inode, offset);
+ error = inode_newsize_ok(inode, offset);
if (error)
return error;
+ truncate_setsize(inode, offset);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
-
- return error;
+ return 0;
}
EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index f5712e8964b..4735ea48181 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
if (length > n)
return ERR_PTR(-EINVAL);
- p = kmalloc(length, GFP_KERNEL);
+ p = memdup_user(s, length);
- if (!p)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(p, s, length)) {
- kfree(p);
- return ERR_PTR(-EFAULT);
- }
+ if (IS_ERR(p))
+ return p;
p[length - 1] = '\0';
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7e314b1009..6b8889da69a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+bool vmap_lazy_unmap __read_mostly = true;
/*** Page table manipulation functions ***/
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
{
unsigned int log;
+ if (!vmap_lazy_unmap)
+ return 0;
+
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -732,7 +736,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
node, gfp_mask);
if (unlikely(IS_ERR(va))) {
kfree(vb);
- return ERR_PTR(PTR_ERR(va));
+ return ERR_CAST(va);
}
err = radix_tree_preload(gfp_mask);
@@ -2437,8 +2441,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
- if (NUMA_BUILD)
+ if (NUMA_BUILD) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ if (ptr == NULL)
+ return -ENOMEM;
+ }
ret = seq_open(file, &vmalloc_op);
if (!ret) {
struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da4..c391c320dba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page,
+ trace_reclaim_flags(page, sync_writeback));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
}
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+ struct pagevec freed_pvec;
+ struct page *page, *tmp;
+
+ pagevec_init(&freed_pvec, 1);
+
+ list_for_each_entry_safe(page, tmp, free_pages, lru) {
+ list_del(&page->lru);
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ }
+
+ pagevec_free(&freed_pvec);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
enum page_references references;
struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ list_add(&page->lru, &free_pages);
continue;
cull_mlocked:
@@ -832,9 +856,10 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+
+ free_page_list(&free_pages);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order, int mode, int file)
{
unsigned long nr_taken = 0;
+ unsigned long nr_lumpy_taken = 0;
+ unsigned long nr_lumpy_dirty = 0;
+ unsigned long nr_lumpy_failed = 0;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
nr_taken++;
+ nr_lumpy_taken++;
+ if (PageDirty(cursor_page))
+ nr_lumpy_dirty++;
scan++;
+ } else {
+ if (mode == ISOLATE_BOTH &&
+ page_count(cursor_page))
+ nr_lumpy_failed++;
}
}
}
*scanned = scan;
+
+ trace_mm_vmscan_lru_isolate(order,
+ nr_to_scan, scan,
+ nr_taken,
+ nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+ mode);
return nr_taken;
}
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
ClearPageActive(page);
nr_active++;
}
- count[lru]++;
+ if (count)
+ count[lru]++;
}
return nr_active;
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
}
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+ unsigned long nr_anon, unsigned long nr_file,
+ struct list_head *page_list)
{
- LIST_HEAD(page_list);
+ struct page *page;
struct pagevec pvec;
- unsigned long nr_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ pagevec_init(&pvec, 1);
- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
+ /*
+ * Put back any unfreeable pages.
+ */
+ spin_lock(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ int lru;
+ page = lru_to_page(page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ reclaim_stat->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
}
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+ struct scan_control *sc,
+ unsigned long *nr_anon,
+ unsigned long *nr_file,
+ struct list_head *isolated_list)
+{
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- pagevec_init(&pvec, 1);
+ nr_active = clear_active_flags(isolated_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
- if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, 0, file);
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- } else {
- nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, sc->mem_cgroup,
- 0, file);
- /*
- * mem_cgroup_isolate_pages() keeps track of
- * scanned pages on its own.
- */
- }
+ *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
- if (nr_taken == 0)
- goto done;
+ reclaim_stat->recent_scanned[0] += *nr_anon;
+ reclaim_stat->recent_scanned[1] += *nr_file;
+}
- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when really
+ * need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+ unsigned long nr_freed,
+ int priority,
+ struct scan_control *sc)
+{
+ int lumpy_stall_priority;
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+ /* kswapd should not stall on sync IO */
+ if (current_is_kswapd())
+ return false;
- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ /* Only stall on lumpy reclaim */
+ if (!sc->lumpy_reclaim_mode)
+ return false;
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ /* If we have relaimed everything on the isolated list, no stall */
+ if (nr_freed == nr_taken)
+ return false;
- spin_unlock_irq(&zone->lru_lock);
+ /*
+ * For high-order allocations, there are two stall thresholds.
+ * High-cost allocations stall immediately where as lower
+ * order allocations such as stacks require the scanning
+ * priority to be much higher before stalling.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ lumpy_stall_priority = DEF_PRIORITY;
+ else
+ lumpy_stall_priority = DEF_PRIORITY / 3;
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ return priority <= lumpy_stall_priority;
+}
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_taken;
+ unsigned long nr_active;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
+
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+
+ if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, 0, file);
+ zone->pages_scanned += nr_scanned;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scanned);
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup,
+ 0, file);
/*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
*/
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- sc->lumpy_reclaim_mode) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
+ if (nr_taken == 0) {
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+ }
- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
- nr_reclaimed += nr_freed;
+ spin_unlock_irq(&zone->lru_lock);
- local_irq_disable();
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+ nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /* Check if we should syncronously wait for writeback */
+ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
- spin_lock(&zone->lru_lock);
/*
- * Put back any unfreeable pages.
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
*/
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ nr_active = clear_active_flags(&page_list, NULL);
+ count_vm_events(PGDEACTIVATE, nr_active);
- } while (nr_scanned < max_scan);
+ nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ }
-done:
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone. This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
- if (priority < zone->prev_priority)
- zone->prev_priority = priority;
+ putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ return nr_reclaimed;
}
/*
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
}
/*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
+
+ /*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
/*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
-
- /*
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
+ spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
bool all_unreclaimable = true;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- sc->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
if (scanning_global_lru(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- note_zone_scanning_priority(zone, priority);
-
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- } else {
- /*
- * Ignore cpuset limitation here. We just want to reduce
- * # of used pages by us regardless of memory shortage.
- */
- mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
- priority);
}
shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
- /*
- * mem_cgroup will not do shrink_slab.
- */
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- }
- }
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scanning_global_lru(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
if (priority < 0)
priority = 0;
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- zone->prev_priority = priority;
- }
- } else
- mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
delayacct_freepages_end();
put_mems_allowed();
@@ -1888,6 +1940,7 @@ out:
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nodemask = nodemask,
};
- return do_try_to_free_pages(zonelist, &sc);
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1908,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone, int nid)
+ struct zone *zone)
{
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
@@ -1918,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.order = 0,
.mem_cgroup = mem,
};
- nodemask_t nm = nodemask_of_node(nid);
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- sc.nodemask = &nm;
- sc.nr_reclaimed = 0;
- sc.nr_scanned = 0;
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed;
}
@@ -1942,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
unsigned int swappiness)
{
struct zonelist *zonelist;
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -1956,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
- return do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#endif
@@ -2028,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.order = order,
.mem_cgroup = NULL,
};
- /*
- * temp_priority is used to remember the scanning priority at which
- * this zone was successfully refilled to
- * free_pages == high_wmark_pages(zone).
- */
- int temp_priority[MAX_NR_ZONES];
-
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++)
- temp_priority[i] = DEF_PRIORITY;
-
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
@@ -2103,7 +2168,6 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- int nid, zid;
if (!populated_zone(zone))
continue;
@@ -2111,18 +2175,14 @@ loop_again:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- temp_priority[i] = priority;
sc.nr_scanned = 0;
- note_zone_scanning_priority(zone, priority);
- nid = pgdat->node_id;
- zid = zone_idx(zone);
/*
* Call soft limit reclaim before calling shrink_zone.
* For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
- nid, zid);
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
@@ -2186,16 +2246,6 @@ loop_again:
break;
}
out:
- /*
- * Note within each zone the priority level at which this zone was
- * brought into a happy state. So that the next thread which scans this
- * zone will start out at that priority level.
- */
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->prev_priority = temp_priority[i];
- }
if (!all_zones_ok) {
cond_resched();
@@ -2299,9 +2349,10 @@ static int kswapd(void *p)
* premature sleep. If not, then go fully
* to sleep until explicitly woken up
*/
- if (!sleeping_prematurely(pgdat, order, remaining))
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
schedule();
- else {
+ } else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
@@ -2321,8 +2372,10 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret)
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balance_pgdat(pgdat, order);
+ }
}
return 0;
}
@@ -2342,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.swappiness = vm_swappiness,
.order = order,
};
- unsigned long slab_reclaimable;
+ unsigned long nr_slab_pages0, nr_slab_pages1;
- disable_swap_token();
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- note_zone_scanning_priority(zone, priority);
shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
- slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (slab_reclaimable > zone->min_slab_pages) {
+ nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages0 > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
@@ -2629,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
- slab_reclaimable - nr_pages)
- ;
+ for (;;) {
+ unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+ /* No reclaimable slab or very low memory pressure */
+ if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ break;
+
+ /* Freed enough memory */
+ nr_slab_pages1 = zone_page_state(zone,
+ NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+ break;
+ }
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- sc.nr_reclaimed += slab_reclaimable -
- zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 < nr_slab_pages0)
+ sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
}
p->reclaim_state = NULL;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e7..f389168f9a8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,14 +22,14 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
+static void sum_vm_events(unsigned long *ret)
{
int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- for_each_cpu(cpu, cpumask) {
+ for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret, cpu_online_mask);
+ sum_vm_events(ret);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n all_unreclaimable: %u"
- "\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
zone->all_unreclaimable,
- zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');