aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig83
-rw-r--r--mm/Kconfig.debug2
-rw-r--r--mm/Makefile7
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/balloon_compaction.c1
-rw-r--r--mm/cma.c29
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c14
-rw-r--r--mm/debug.c2
-rw-r--r--mm/dmapool.c5
-rw-r--r--mm/filemap.c217
-rw-r--r--mm/gup.c409
-rw-r--r--mm/gup_benchmark.c5
-rw-r--r--mm/hmm.c1097
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c181
-rw-r--r--mm/hwpoison-inject.c1
-rw-r--r--mm/internal.h6
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/khugepaged.c7
-rw-r--r--mm/kmemleak-test.c14
-rw-r--r--mm/kmemleak.c15
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/list_lru.c9
-rw-r--r--mm/maccess.c1
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/memblock.c88
-rw-r--r--mm/memcontrol.c396
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c109
-rw-r--r--mm/memory_hotplug.c133
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mincore.c23
-rw-r--r--mm/mm_init.c1
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/mmu_notifier.c12
-rw-r--r--mm/mprotect.c9
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/nommu.c15
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page-writeback.c13
-rw-r--r--mm/page_alloc.c353
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/percpu-internal.h15
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu-stats.c8
-rw-r--r--mm/percpu-vm.c3
-rw-r--r--mm/percpu.c552
-rw-r--r--mm/process_vm_access.c6
-rw-r--r--mm/readahead.c1
-rw-r--r--mm/rmap.c10
-rw-r--r--mm/rodata_test.c6
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/shuffle.c207
-rw-r--r--mm/shuffle.h64
-rw-r--r--mm/slab.c273
-rw-r--r--mm/slob.c59
-rw-r--r--mm/slub.c72
-rw-r--r--mm/sparse.c16
-rw-r--r--mm/swap.c3
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c1
-rw-r--r--mm/truncate.c1
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c64
-rw-r--r--mm/vmalloc.c1128
-rw-r--r--mm/vmscan.c209
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c642
-rw-r--r--mm/zbud.c1
-rw-r--r--mm/zpool.c1
-rw-r--r--mm/zswap.c11
75 files changed, 4394 insertions, 2336 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..f0c76ba47695 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "Memory Management options"
@@ -11,23 +12,24 @@ choice
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here selected by the architecture
+ configuration. This is normal.
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
- This option allows you to change some of the ways that
- Linux manages its memory internally. Most users will
- only have one option here: FLATMEM. This is normal
- and a correct option.
-
- Some users of more advanced features like NUMA and
- memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
- but is incompatible with memory hotplug and may suffer
- decreased performance over SPARSEMEM. If unsure between
- "Sparse Memory" and "Discontiguous Memory", choose
- "Discontiguous Memory".
+ This option is best suited for non-NUMA systems with
+ flat address space. The FLATMEM is the most efficient
+ system in terms of performance and resource consumption
+ and it is the best option for smaller systems.
+
+ For systems that have holes in their physical address
+ spaces and for features like NUMA and memory hotplug,
+ choose "Sparse Memory"
If unsure, choose this option (Flat Memory) over any other.
@@ -38,29 +40,26 @@ config DISCONTIGMEM_MANUAL
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
- more efficient handling of these holes. However, the vast
- majority of hardware has quite flat address spaces, and
- can have degraded performance from the extra overhead that
- this option imposes.
+ more efficient handling of these holes.
- Many NUMA configurations will have this as the only option.
+ Although "Discontiguous Memory" is still used by several
+ architectures, it is considered deprecated in favor of
+ "Sparse Memory".
- If unsure, choose "Flat Memory" over this option.
+ If unsure, choose "Sparse Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
- memory hotplug systems. This is normal.
+ memory hot-plug systems. This is normal.
- For many other systems, this will be an alternative to
- "Discontiguous Memory". This option provides some potential
- performance benefits, along with decreased code complexity,
- but it is newer, and more experimental.
+ This option provides efficient support for systems with
+ holes is their physical address space and allows memory
+ hot-plug and hot-remove.
- If unsure, choose "Discontiguous Memory" or "Flat Memory"
- over this option.
+ If unsure, choose "Flat Memory" over this option.
endchoice
@@ -136,7 +135,7 @@ config HAVE_MEMBLOCK_PHYS_MAP
config HAVE_GENERIC_GUP
bool
-config ARCH_DISCARD_MEMBLOCK
+config ARCH_KEEP_MEMBLOCK
bool
config MEMORY_ISOLATION
@@ -161,7 +160,6 @@ config MEMORY_HOTPLUG_SPARSE
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
- default n
depends on MEMORY_HOTPLUG
help
This option sets the default policy setting for memory hotplug
@@ -258,6 +256,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config ARCH_ENABLE_THP_MIGRATION
bool
+config CONTIG_ALLOC
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -436,7 +437,6 @@ config NEED_PER_CPU_KM
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
- default n
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
@@ -460,7 +460,6 @@ config CLEANCACHE
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
- default n
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
@@ -532,7 +531,6 @@ config ZSWAP
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
- default n
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -549,14 +547,12 @@ config ZSWAP
config ZPOOL
tristate "Common API for compressed memory storage"
- default n
help
Compressed memory storage API. This allows using either zbud or
zsmalloc.
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -567,7 +563,6 @@ config ZBUD
config Z3FOLD
tristate "Up to 3x density storage for compressed pages"
depends on ZPOOL
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical
@@ -577,7 +572,6 @@ config Z3FOLD
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
- default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
@@ -628,7 +622,6 @@ config MAX_STACK_SIZE_MB
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
- default n
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
@@ -676,6 +669,22 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM_MIRROR
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MMU && 64BIT
+
+config ARCH_HAS_HMM_DEVICE
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+ depends on ARCH_HAS_ZONE_DEVICE
+ select XARRAY_MULTI
+
config ARCH_HAS_HMM
bool
default y
@@ -694,12 +703,12 @@ config DEV_PAGEMAP_OPS
config HMM
bool
+ select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
- select MMU_NOTIFIER
select HMM
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
@@ -740,7 +749,6 @@ config ARCH_HAS_PKEYS
config PERCPU_STATS
bool "Collect percpu memory statistics"
- default n
help
This feature collects and exposes statistics via debugfs. The
information includes global and per chunk statistics, which can
@@ -748,7 +756,6 @@ config PERCPU_STATS
config GUP_BENCHMARK
bool "Enable infrastructure for get_user_pages_fast() benchmarking"
- default n
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
performance of get_user_pages_fast().
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e3df921208c0..fa6d79281368 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config PAGE_EXTENSION
bool "Extend memmap on extra space for more information on page"
---help---
@@ -33,7 +34,6 @@ config DEBUG_PAGEALLOC
config DEBUG_PAGEALLOC_ENABLE_DEFAULT
bool "Enable debug page memory allocations by default?"
- default n
depends on DEBUG_PAGEALLOC
---help---
Enable debug page memory allocations by default? This value
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o \
+ maccess.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
@@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 72e6d0c55cfa..909dae445ea7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
#include <linux/backing-dev.h>
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index ef858d547e2d..ba739b76e6c5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/balloon_compaction.c
*
diff --git a/mm/cma.c b/mm/cma.c
index bb2d333ffcb3..3340ef34c154 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Contiguous Memory Allocator
*
@@ -9,11 +10,6 @@
* Michal Nazarewicz <mina86@mina86.com>
* Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
* Joonsoo Kim <iamjoonsoo.kim@lge.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License or (at your optional) any later version of the license.
*/
#define pr_fmt(fmt) "cma: " fmt
@@ -106,8 +102,10 @@ static int __init cma_activate_area(struct cma *cma)
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap)
+ if (!cma->bitmap) {
+ cma->count = 0;
return -ENOMEM;
+ }
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -367,23 +365,26 @@ err:
#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit;
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
unsigned long start = 0;
- unsigned int nr_zero, nr_total = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
- if (next_zero_bit >= cma->count)
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
break;
- next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
nr_zero = next_set_bit - next_zero_bit;
- pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
- nr_total += nr_zero;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
start = next_zero_bit + nr_zero;
}
- pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
mutex_unlock(&cma->lock);
}
#else
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 8d7b2fd52225..a7dd9e8e10d5 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
mutex_lock(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= cma->count)
+ if (start >= bitmap_maxno)
break;
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3319e0872d01..9e1b9acb116b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc,
static inline unsigned int
freelist_scan_limit(struct compact_control *cc)
{
- return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+ unsigned short shift = BITS_PER_LONG - 1;
+
+ return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
}
/*
@@ -1228,7 +1230,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
/* Pageblock boundaries */
start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
/* Scan before */
if (start_pfn != pfn) {
@@ -1239,7 +1241,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
/* Scan after */
start_pfn = pfn + nr_isolated;
- if (start_pfn != end_pfn)
+ if (start_pfn < end_pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
@@ -1397,7 +1399,7 @@ fast_isolate_freepages(struct compact_control *cc)
page = pfn_to_page(highest);
cc->free_pfn = highest;
} else {
- if (cc->direct_compaction) {
+ if (cc->direct_compaction && pfn_valid(min_pfn)) {
page = pfn_to_page(min_pfn);
cc->free_pfn = min_pfn;
}
@@ -1886,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc)
bool can_steal;
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[migratetype]))
+ if (!free_area_empty(area, migratetype))
return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
- !list_empty(&area->free_list[MIGRATE_CMA]))
+ !free_area_empty(area, MIGRATE_CMA))
return COMPACT_SUCCESS;
#endif
/*
diff --git a/mm/debug.c b/mm/debug.c
index eee9c221280c..8345bb6e4769 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -67,7 +67,7 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx",
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageCompound(page))
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 76a160083506..8c94c89a6f7e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* DMA Pool allocator
*
@@ -5,10 +6,6 @@
* Copyright 2007 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 as published by the
- * Free Software Foundation.
- *
* This allocator returns small blocks of a given size which are DMA-able by
* the given device. It uses the dma_alloc_coherent page allocator to get
* new pages, then splits them up into blocks of the required size.
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577baef2..df2006ba0cfa 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/filemap.c
*
@@ -24,6 +25,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -279,11 +281,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
@@ -292,40 +294,44 @@ static void page_cache_delete_batch(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + (1UL << compound_order(page)) - 1 ==
+ xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
@@ -878,6 +884,7 @@ error:
put_page(page);
return xas_error(&xas);
}
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -1440,7 +1447,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_miss() - Find the next gap in the page cache.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
@@ -1491,7 +1498,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
@@ -1506,25 +1513,19 @@ repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1706,7 +1707,6 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1717,17 +1717,13 @@ unsigned find_get_entries(struct address_space *mapping,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -1736,7 +1732,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1778,33 +1774,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1849,7 +1839,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1859,24 +1848,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1912,7 +1896,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1923,26 +1906,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1964,72 +1942,6 @@ out:
}
EXPORT_SYMBOL(find_get_pages_range_tag);
-/**
- * find_get_entries_tag - find and return entries that match @tag
- * @mapping: the address_space to search
- * @start: the starting page cache index
- * @tag: the tag index
- * @nr_entries: the maximum number of entries
- * @entries: where the resulting entries are placed
- * @indices: the cache indices corresponding to the entries in @entries
- *
- * Like find_get_entries, except we only return entries which are tagged with
- * @tag.
- *
- * Return: the number of entries which were found.
- */
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- xa_mark_t tag, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
-{
- XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
-
- rcu_read_lock();
- xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
- struct page *head;
- if (xas_retry(&xas, page))
- continue;
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
- if (xa_is_value(page))
- goto export;
-
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto retry;
-
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
-export:
- indices[ret] = xas.xa_index;
- entries[ret] = page;
- if (++ret == nr_entries)
- break;
- continue;
-put_page:
- put_page(head);
-retry:
- xas_reset(&xas);
- }
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL(find_get_entries_tag);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2691,7 +2603,7 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2700,24 +2612,19 @@ void filemap_map_pages(struct vm_fault *vmf,
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
diff --git a/mm/gup.c b/mm/gup.c
index 91819b8ad9cc..ddde097cf9e4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
@@ -28,6 +29,111 @@ struct follow_page_context {
unsigned int page_mask;
};
+typedef int (*set_dirty_func_t)(struct page *page);
+
+static void __put_user_pages_dirty(struct page **pages,
+ unsigned long npages,
+ set_dirty_func_t sdf)
+{
+ unsigned long index;
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key cases:
+ *
+ * 1) This code sees the page as already dirty, so it skips
+ * the call to sdf(). That could happen because
+ * clear_page_dirty_for_io() called page_mkclean(),
+ * followed by set_page_dirty(). However, now the page is
+ * going to get written back, which meets the original
+ * intention of setting it dirty, so all is well:
+ * clear_page_dirty_for_io() goes on to call
+ * TestClearPageDirty(), and write the page back.
+ *
+ * 2) This code sees the page as clean, so it calls sdf().
+ * The page stays dirty, despite being written back, so it
+ * gets written back again in the next writeback cycle.
+ * This is harmless.
+ */
+ if (!PageDirty(page))
+ sdf(page);
+
+ put_user_page(page);
+ }
+}
+
+/**
+ * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * set_page_dirty(), which does not lock the page, is used here.
+ * Therefore, it is the caller's responsibility to ensure that this is
+ * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ *
+ */
+void put_user_pages_dirty(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty);
+}
+EXPORT_SYMBOL(put_user_pages_dirty);
+
+/**
+ * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * This is just like put_user_pages_dirty(), except that it invokes
+ * set_page_dirty_lock(), instead of set_page_dirty().
+ *
+ */
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+}
+EXPORT_SYMBOL(put_user_pages_dirty_lock);
+
+/**
+ * put_user_pages() - release an array of gup-pinned pages.
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ */
+void put_user_pages(struct page **pages, unsigned long npages)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+ for (index = 0; index < npages; index++)
+ put_user_page(pages[index]);
+}
+EXPORT_SYMBOL(put_user_pages);
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -936,10 +1042,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
BUG_ON(ret >= nr_pages);
}
- if (!pages)
- /* If it's a prefault don't insist harder */
- return ret;
-
if (ret > 0) {
nr_pages -= ret;
pages_done += ret;
@@ -955,8 +1057,12 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = ret;
break;
}
- /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
- pages += ret;
+ /*
+ * VM_FAULT_RETRY triggered, so seek to the faulting offset.
+ * For the prefault case (!pages) we only update counts.
+ */
+ if (likely(pages))
+ pages += ret;
start += ret << PAGE_SHIFT;
/*
@@ -979,7 +1085,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done++;
if (!nr_pages)
break;
- pages++;
+ if (likely(pages))
+ pages++;
start += PAGE_SIZE;
}
if (lock_dropped && *locked) {
@@ -1018,6 +1125,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(current, current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
@@ -1046,6 +1162,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int locked = 1;
long ret;
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
down_read(&mm->mmap_sem);
ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
@@ -1116,32 +1241,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-
-#ifdef CONFIG_FS_DAX
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
long i;
@@ -1160,12 +1275,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
return false;
}
-#else
-static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
- return false;
-}
-#endif
#ifdef CONFIG_CMA
static struct page *new_non_cma_page(struct page *page, unsigned long private)
@@ -1219,10 +1328,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
return __alloc_pages_node(nid, gfp_mask, 0);
}
-static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
long i;
bool drain_allow = true;
@@ -1278,10 +1390,14 @@ check_again:
putback_movable_pages(&cma_page_list);
}
/*
- * We did migrate all the pages, Try to get the page references again
- * migrating any new CMA pages which we failed to isolate earlier.
+ * We did migrate all the pages, Try to get the page references
+ * again migrating any new CMA pages which we failed to isolate
+ * earlier.
*/
- nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+
if ((nr_pages > 0) && migrate_allow) {
drain_allow = true;
goto check_again;
@@ -1291,66 +1407,101 @@ check_again:
return nr_pages;
}
#else
-static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
- struct page **pages,
- struct vm_area_struct **vmas)
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
return nr_pages;
}
#endif
/*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
- *
- * "longterm" == userspace controlled elevated page count lifetime.
- * Contrast this to iov_iter_get_pages() usages which are transient.
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
*/
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+static long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
- struct vm_area_struct **vmas = vmas_arg;
- unsigned long flags;
+ struct vm_area_struct **vmas_tmp = vmas;
+ unsigned long flags = 0;
long rc, i;
- if (!pages)
- return -EINVAL;
-
- if (!vmas) {
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- return -ENOMEM;
+ if (gup_flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
+ flags = memalloc_nocma_save();
}
- flags = memalloc_nocma_save();
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
- memalloc_nocma_restore(flags);
- if (rc < 0)
- goto out;
+ rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ vmas_tmp, NULL, gup_flags);
- if (check_dax_vmas(vmas, rc)) {
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
- goto out;
+ if (gup_flags & FOLL_LONGTERM) {
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
+
+ if (check_dax_vmas(vmas_tmp, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ vmas_tmp, gup_flags);
}
- rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
- if (vmas != vmas_arg)
- kfree(vmas);
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
return rc;
}
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
/**
* populate_vma_page_range() - populate a range of pages in the vma.
@@ -1571,7 +1722,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1589,10 +1740,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (pte_protnone(pte))
goto pte_unmap;
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1641,7 +1795,7 @@ pte_unmap:
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
@@ -1724,16 +1878,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pmd_access_permitted(orig, write))
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pmd_devmap(orig))
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ }
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1762,16 +1919,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pud_access_permitted(orig, write))
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pud_devmap(orig))
+ if (pud_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+ }
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1800,13 +1960,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, int write,
+ unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
- if (!pgd_access_permitted(orig, write))
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1837,7 +1997,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -1860,7 +2020,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -1870,9 +2030,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, write, pages, nr))
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -1880,7 +2040,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
}
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -1893,14 +2053,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (pud_none(pud))
return 0;
if (unlikely(pud_huge(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, write, pages, nr))
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -1908,7 +2068,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -1923,9 +2083,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, write, pages, nr))
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -1933,7 +2093,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
@@ -1946,14 +2106,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, nr))
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2007,18 +2167,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_save(flags);
- gup_pgd_range(start, end, write, pages, &nr);
+ gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
}
return nr;
}
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int ret;
+
+ /*
+ * FIXME: FOLL_LONGTERM does not work with
+ * get_user_pages_unlocked() (see comments in that function)
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ down_read(&current->mm->mmap_sem);
+ ret = __gup_longterm_locked(current, current->mm,
+ start, nr_pages,
+ pages, NULL, gup_flags);
+ up_read(&current->mm->mmap_sem);
+ } else {
+ ret = get_user_pages_unlocked(start, nr_pages,
+ pages, gup_flags);
+ }
+
+ return ret;
+}
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -2030,8 +2213,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
@@ -2049,7 +2232,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
ret = nr;
}
@@ -2059,8 +2242,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ ret = __gup_longterm_unlocked(start, nr_pages - nr,
+ gup_flags, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 6c0279e70cc4..7dd602d7f8db 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
- pages + i, NULL);
+ nr = get_user_pages(addr, nr,
+ (gup->flags & 1) | FOLL_LONGTERM,
+ pages + i, NULL);
break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..c5d840e34b28 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2013 Red Hat Inc.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Authors: Jérôme Glisse <jglisse@redhat.com>
*/
/*
@@ -30,6 +21,7 @@
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -38,54 +30,48 @@
#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
- struct mm_struct *mm;
- spinlock_t lock;
- struct list_head ranges;
- struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
- struct rw_semaphore mirrors_sem;
-};
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
-/*
- * hmm_register - register HMM against an mm (HMM internal)
+ if (hmm && kref_get_unless_zero(&hmm->kref))
+ return hmm;
+
+ return NULL;
+}
+
+/**
+ * hmm_get_or_create - register HMM against an mm (HMM internal)
*
* @mm: mm struct to attach to
+ * Returns: returns an HMM object, either by referencing the existing
+ * (per-process) object, or by creating a new one.
*
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
+ * This is not intended to be used directly by device drivers. If mm already
+ * has an HMM struct then it get a reference on it and returns it. Otherwise
+ * it allocates an HMM struct, initializes it, associate it with the mm and
+ * returns it.
*/
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
- struct hmm *hmm = READ_ONCE(mm->hmm);
+ struct hmm *hmm = mm_get_hmm(mm);
bool cleanup = false;
- /*
- * The hmm struct can only be freed once the mm_struct goes away,
- * hence we should always have pre-allocated an new hmm struct
- * above.
- */
if (hmm)
return hmm;
hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
return NULL;
+ init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->lock);
+ mutex_init(&hmm->lock);
+ kref_init(&hmm->kref);
+ hmm->notifiers = 0;
+ hmm->dead = false;
hmm->mm = mm;
spin_lock(&mm->page_table_lock);
@@ -106,7 +92,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
- return mm->hmm;
+ return hmm;
error_mm:
spin_lock(&mm->page_table_lock);
@@ -118,54 +104,60 @@ error:
return NULL;
}
-void hmm_mm_destroy(struct mm_struct *mm)
+static void hmm_free(struct kref *kref)
{
- kfree(mm->hmm);
-}
+ struct hmm *hmm = container_of(kref, struct hmm, kref);
+ struct mm_struct *mm = hmm->mm;
-static int hmm_invalidate_range(struct hmm *hmm, bool device,
- const struct hmm_update *update)
-{
- struct hmm_mirror *mirror;
- struct hmm_range *range;
-
- spin_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- unsigned long addr, idx, npages;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
- if (update->end < range->start || update->start >= range->end)
- continue;
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
- range->valid = false;
- addr = max(update->start, range->start);
- idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
- }
- spin_unlock(&hmm->lock);
+ kfree(hmm);
+}
- if (!device)
- return 0;
+static inline void hmm_put(struct hmm *hmm)
+{
+ kref_put(&hmm->kref, hmm_free);
+}
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- int ret;
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ struct hmm *hmm;
- ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
- if (!update->blockable && ret == -EAGAIN) {
- up_read(&hmm->mirrors_sem);
- return -EAGAIN;
- }
+ spin_lock(&mm->page_table_lock);
+ hmm = mm_get_hmm(mm);
+ mm->hmm = NULL;
+ if (hmm) {
+ hmm->mm = NULL;
+ hmm->dead = true;
+ spin_unlock(&mm->page_table_lock);
+ hmm_put(hmm);
+ return;
}
- up_read(&hmm->mirrors_sem);
- return 0;
+ spin_unlock(&mm->page_table_lock);
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
+ struct hmm *hmm = mm_get_hmm(mm);
struct hmm_mirror *mirror;
- struct hmm *hmm = mm->hmm;
+ struct hmm_range *range;
+
+ /* Report this HMM as dying. */
+ hmm->dead = true;
+
+ /* Wake-up everyone waiting on any range. */
+ mutex_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ range->valid = false;
+ }
+ wake_up_all(&hmm->wq);
+ mutex_unlock(&hmm->lock);
down_write(&hmm->mirrors_sem);
mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +178,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
struct hmm_mirror, list);
}
up_write(&hmm->mirrors_sem);
+
+ hmm_put(hmm);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
+ struct hmm_mirror *mirror;
struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm_range *range;
+ int ret = 0;
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
+ update.start = nrange->start;
+ update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = range->blockable;
- return hmm_invalidate_range(hmm, true, &update);
+ update.blockable = mmu_notifier_range_blockable(nrange);
+
+ if (mmu_notifier_range_blockable(nrange))
+ mutex_lock(&hmm->lock);
+ else if (!mutex_trylock(&hmm->lock)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ hmm->notifiers++;
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (update.end < range->start || update.start >= range->end)
+ continue;
+
+ range->valid = false;
+ }
+ mutex_unlock(&hmm->lock);
+
+ if (mmu_notifier_range_blockable(nrange))
+ down_read(&hmm->mirrors_sem);
+ else if (!down_read_trylock(&hmm->mirrors_sem)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ if (!update.blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+ up_read(&hmm->mirrors_sem);
+
+out:
+ hmm_put(hmm);
+ return ret;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
- update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = true;
- hmm_invalidate_range(hmm, false, &update);
+ mutex_lock(&hmm->lock);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
+
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
+ }
+ wake_up_all(&hmm->wq);
+ }
+ mutex_unlock(&hmm->lock);
+
+ hmm_put(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -241,24 +283,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
-again:
- mirror->hmm = hmm_register(mm);
+ mirror->hmm = hmm_get_or_create(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- if (mirror->hmm->mm == NULL) {
- /*
- * A racing hmm_mirror_unregister() is about to destroy the hmm
- * struct. Try again to allocate a new one.
- */
- up_write(&mirror->hmm->mirrors_sem);
- mirror->hmm = NULL;
- goto again;
- } else {
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
- }
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
return 0;
}
@@ -273,38 +304,24 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- bool should_unregister = false;
- struct mm_struct *mm;
- struct hmm *hmm;
+ struct hmm *hmm = READ_ONCE(mirror->hmm);
- if (mirror->hmm == NULL)
+ if (hmm == NULL)
return;
- hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
list_del_init(&mirror->list);
- should_unregister = list_empty(&hmm->mirrors);
+ /* To protect us against double unregister ... */
mirror->hmm = NULL;
- mm = hmm->mm;
- hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
- if (!should_unregister || mm == NULL)
- return;
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
bool fault;
bool block;
@@ -323,13 +340,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
- return -EBUSY;
+ return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
- return -EAGAIN;
+ return -EBUSY;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -355,7 +372,7 @@ static int hmm_pfns_bad(unsigned long addr,
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -367,23 +384,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i, page_size;
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
+ page_size = hmm_range_page_size(range);
+ i = (addr - range->start) >> range->page_shift;
+
+ for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
ret = hmm_vma_do_fault(walk, addr, write_fault,
&pfns[i]);
- if (ret != -EAGAIN)
+ if (ret != -EBUSY)
return ret;
}
}
- return (fault || write_fault) ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EBUSY : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -392,10 +411,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
struct hmm_range *range = hmm_vma_walk->range;
- *fault = *write_fault = false;
if (!hmm_vma_walk->fault)
return;
+ /*
+ * So we not only consider the individual per page request we also
+ * consider the default flags requested for the range. The API can
+ * be use in 2 fashions. The first one where the HMM user coalesce
+ * multiple page fault into one request and set flags per pfns for
+ * of those faults. The second one where the HMM user want to pre-
+ * fault a range with specific flags. For the latter one it is a
+ * waste to have the user pre-fill the pfn arrays with a default
+ * flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
@@ -431,10 +461,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
return;
}
+ *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
fault, write_fault);
- if ((*fault) || (*write_fault))
+ if ((*write_fault))
return;
}
}
@@ -465,12 +496,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
}
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
static int hmm_vma_handle_pmd(struct mm_walk *walk,
unsigned long addr,
unsigned long end,
uint64_t *pfns,
pmd_t pmd)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
@@ -486,10 +527,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
+#else
+ /* If THP is not enabled then we should never reach that code ! */
+ return -EINVAL;
+#endif
}
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -514,11 +570,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t orig_pfn = *pfn;
*pfn = range->values[HMM_PFN_NONE];
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
+ fault = write_fault = false;
if (pte_none(pte)) {
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -546,7 +602,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
&fault, &write_fault);
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn = hmm_device_entry_from_pfn(range,
+ swp_offset(entry));
*pfn |= cpu_flags;
return 0;
}
@@ -557,7 +614,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
pmdp, addr);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
}
@@ -565,15 +622,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+ } else {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
}
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
+ *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -615,7 +690,7 @@ again:
if (fault || write_fault) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(vma->vm_mm, pmdp);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
} else if (!pmd_present(pmd))
@@ -661,12 +736,158 @@ again:
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+ * so that we can leverage get_dev_pagemap() optimization which
+ * will not re-take a reference on a pgmap if we already have
+ * one.
+ */
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+static int hmm_vma_walk_pud(pud_t *pudp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ split_huge_pud(walk->vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ size = 1UL << huge_page_shift(h);
+ mask = size - 1;
+ if (range->page_shift != PAGE_SHIFT) {
+ /* Make sure we are looking at full page. */
+ if (start & mask)
+ return -EINVAL;
+ if (end < (start + size))
+ return -EINVAL;
+ pfn_inc = size >> PAGE_SHIFT;
+ } else {
+ pfn_inc = 1;
+ size = PAGE_SIZE;
+ }
+
+
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> range->page_shift;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+ for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+ return -EINVAL;
+#endif
+}
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
@@ -676,279 +897,437 @@ static void hmm_pfns_clear(struct hmm_range *range,
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * vma permission, 0 success
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
+ * hmm_range_register() - start tracking change to CPU page table over a range
+ * @range: range
+ * @mm: the mm struct for the range of virtual address
+ * @start: start virtual address (inclusive)
+ * @end: end virtual address (exclusive)
+ * @page_shift: expect page shift for the range
+ * Returns 0 on success, -EFAULT if the address space is no longer valid
*
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
- *
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table see include/linux/hmm.h
*/
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift)
{
- struct vm_area_struct *vma = range->vma;
- struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ unsigned long mask = ((1UL << page_shift) - 1UL);
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
+ range->valid = false;
+ range->hmm = NULL;
+
+ if ((start & mask) || (end & mask))
return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
+ if (start >= end)
return -EINVAL;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm)
- return -ENOMEM;
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ range->page_shift = page_shift;
+ range->start = start;
+ range->end = end;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ range->hmm = hmm_get_or_create(mm);
+ if (!range->hmm)
+ return -EFAULT;
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ /* Check if hmm_mm_destroy() was call. */
+ if (range->hmm->mm == NULL || range->hmm->dead) {
+ hmm_put(range->hmm);
+ return -EFAULT;
}
/* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
-
- walk_page_range(range->start, range->end, &mm_walk);
+ mutex_lock(&range->hmm->lock);
+
+ list_add_rcu(&range->list, &range->hmm->ranges);
+
+ /*
+ * If there are any concurrent notifiers we have to wait for them for
+ * the range to be valid (see hmm_range_wait_until_valid()).
+ */
+ if (!range->hmm->notifiers)
+ range->valid = true;
+ mutex_unlock(&range->hmm->lock);
+
return 0;
}
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
/*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking change to CPU page table over a range
+ * @range: range
*
* Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data, or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * trans = device_build_page_table_update_transaction(pfns);
- * device_page_table_lock();
- * if (!hmm_vma_range_done(range)) {
- * device_page_table_unlock();
- * goto again;
- * }
- * device_commit_transaction(trans);
- * device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ */
+void hmm_range_unregister(struct hmm_range *range)
+{
+ /* Sanity check this really should not happen. */
+ if (range->hmm == NULL || range->end <= range->start)
+ return;
+
+ mutex_lock(&range->hmm->lock);
+ list_del_rcu(&range->list);
+ mutex_unlock(&range->hmm->lock);
+
+ /* Drop reference taken by hmm_range_register() */
+ range->valid = false;
+ hmm_put(range->hmm);
+ range->hmm = NULL;
+}
+EXPORT_SYMBOL(hmm_range_unregister);
+
+/*
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * permission (for instance asking for write and range is read only),
+ * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
+ * vma or it is illegal to access that range), number of valid pages
+ * in range->pfns[] (from range start address).
*
- * Or:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * device_page_table_lock();
- * hmm_vma_range_done(range);
- * device_update_page_table(range->pfns);
- * device_page_table_unlock();
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by range struct. See in include/linux/hmm.h for example
+ * on how to use.
*/
-bool hmm_vma_range_done(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
{
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
- struct hmm *hmm;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
+ struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
+ struct mm_walk mm_walk;
- if (range->end <= range->start) {
- BUG();
- return false;
- }
+ /* Check if hmm_mm_destroy() was call. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(range->vma->vm_mm);
- if (!hmm) {
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
- return false;
- }
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid)
+ return -EAGAIN;
- spin_lock(&hmm->lock);
- list_del_rcu(&range->list);
- spin_unlock(&hmm->lock);
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
- return range->valid;
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architecture that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ walk_page_range(start, end, &mm_walk);
+ start = end;
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_snapshot);
/*
- * hmm_vma_fault() - try to fault some address in a virtual address range
+ * hmm_range_fault() - try to fault some address in a virtual address range
* @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ * Returns: number of valid pages in range->pfns[] (from range start
+ * address). This may be zero. If the return value is negative,
+ * then one of the following values may be returned:
+ *
+ * -EINVAL invalid arguments or mm or virtual address are in an
+ * invalid vma (for instance device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (for instance asking for write and
+ * range is read only).
+ * -EAGAIN: If you need to retry and mmap_sem was drop. This can only
+ * happens if block argument is false.
+ * -EBUSY: If the the range is being invalidated and you should wait
+ * for invalidation to finish.
+ * -EFAULT: Invalid (ie either no valid vma or it is illegal to access
+ * that range), number of valid pages in range->pfns[] (from
+ * range start address).
*
* This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and caller does not ask for migration.
*
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- * down_read(&mm->mmap_sem);
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
- * // array accordingly
- * ret = hmm_vma_fault(range, write, block);
- * switch (ret) {
- * case -EAGAIN:
- * hmm_vma_range_done(range);
- * // You might want to rate limit or yield to play nicely, you may
- * // also commit any valid pfn in the array assuming that you are
- * // getting true from hmm_vma_range_monitor_end()
- * goto retry;
- * case 0:
- * break;
- * case -ENOMEM:
- * case -EINVAL:
- * case -EPERM:
- * default:
- * // Handle error !
- * up_read(&mm->mmap_sem)
- * return;
- * }
- * // Take device driver lock that serialize device page table update
- * driver_lock_device_page_table_update();
- * hmm_vma_range_done(range);
- * // Commit pfns we got from hmm_vma_fault()
- * driver_unlock_device_page_table_update();
- * up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, bool block)
{
- struct vm_area_struct *vma = range->vma;
- unsigned long start = range->start;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
struct mm_walk mm_walk;
- struct hmm *hmm;
int ret;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
- return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
- return -EINVAL;
+ /* Check if hmm_mm_destroy() was call. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm) {
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -ENOMEM;
- }
- /* Caller must have registered a mirror using hmm_mirror_register() */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid) {
+ up_read(&hmm->mm->mmap_sem);
+ return -EAGAIN;
+ }
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ if (huge_page_shift(hstate_vma(vma)) !=
+ range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architecture that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+
+ /* Keep trying while the range is valid. */
+ } while (ret == -EBUSY && range->valid);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(range, &range->pfns[i],
+ hmm_vma_walk.last, range->end);
+ return ret;
+ }
+ start = end;
+
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against to dma map page to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ * drop and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block)
+{
+ unsigned long i, npages, mapped;
+ long ret;
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (!(vma->vm_flags & VM_READ)) {
/*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
+ * FIXME need to update DMA API to provide invalid DMA address
+ * value instead of a function to test dma address value. This
+ * would remove lot of dumb code duplicated accross many arch.
+ *
+ * For now setting it to 0 here is good enough as the pfns[]
+ * value is what is use to check what is valid and what isn't.
*/
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ daddrs[i] = 0;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
+ }
+
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
}
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- hmm_vma_walk.last = range->start;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
+ return mapped;
- do {
- ret = walk_page_range(start, range->end, &mm_walk);
- start = hmm_vma_walk.last;
- } while (ret == -EAGAIN);
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (ret) {
- unsigned long i;
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
- range->end);
- hmm_vma_range_done(range);
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ mapped--;
}
+
return ret;
}
-EXPORT_SYMBOL(hmm_vma_fault);
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/**
+ * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty page if it had the write flag set
+ * Returns: number of page unmapped on success, -EINVAL otherwise
+ *
+ * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
+ * to the sync_cpu_device_pagetables() callback so that it is safe here to
+ * call set_page_dirty(). Caller must also take appropriate locks to avoid
+ * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
+{
+ unsigned long i, npages;
+ long cpages = 0;
+
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
+
+ /*
+ * See comments in function description on why it is
+ * safe here to call set_page_dirty()
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
+
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ /* FIXME see comments in hmm_vma_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
+ }
+
+ return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6a34b32d8ac..9f8bce9a6b32 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -509,7 +509,7 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
unsigned long addr;
@@ -793,11 +793,13 @@ out_unlock:
pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -820,7 +822,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
+ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -869,10 +871,12 @@ out_unlock:
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PUD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -889,7 +893,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1220,8 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
cond_resched();
}
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1384,8 +1388,8 @@ alloc:
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
@@ -2060,7 +2064,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
@@ -2278,7 +2283,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
@@ -2492,6 +2498,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
}
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 641cedfc8c0f..ac843d32b019 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004
@@ -740,7 +741,15 @@ void resv_map_release(struct kref *ref)
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
- return inode->i_mapping->private_data;
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+ * The VERY common case is inode->mapping == &inode->i_data but,
+ * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -1059,6 +1068,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_CONTIG_ALLOC
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1143,11 +1153,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask) { return NULL; }
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
@@ -1157,7 +1176,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
@@ -1258,12 +1277,23 @@ void free_huge_page(struct page *page)
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -1574,8 +1604,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
put_page(page);
- page = NULL;
+ return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2277,13 +2308,47 @@ found:
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
- nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+ * Gigantic pages runtime allocation depend on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
/*
* Increase the pool size
@@ -2296,7 +2361,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
- spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
@@ -2351,9 +2415,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ return 0;
}
#define HSTATE_ATTR_RO(_name) \
@@ -2403,41 +2468,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
unsigned long count, size_t len)
{
int err;
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- } else if (nodes_allowed) {
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
/*
- * per node hstate attribute: adjust count to global,
- * but restrict alloc/free to the specified node.
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
*/
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
- init_nodemask_of_node(nodes_allowed, nid);
- } else
- nodes_allowed = &node_states[N_MEMORY];
-
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
+ err = set_max_huge_pages(h, count, nid, n_mask);
- return len;
-out:
- NODEMASK_FREE(nodes_allowed);
- return err;
+ return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3247,7 +3303,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
- mmu_notifier_range_init(&range, src, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -3359,7 +3416,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
@@ -3626,7 +3684,8 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
@@ -3777,8 +3836,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3886,21 +3944,14 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3911,9 +3962,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3958,7 +4007,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4371,7 +4420,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
@@ -4477,6 +4527,11 @@ int hugetlb_reserve_pages(struct inode *inode,
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
@@ -4568,6 +4623,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index b6ac70616c32..1a7497d015b2 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Inject a hwpoison memory failure on a arbitrary pfn */
#include <linux/module.h>
#include <linux/debugfs.h>
diff --git a/mm/internal.h b/mm/internal.h
index 9eeaf2b95166..e32390802fd3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 36afcf64e016..242fdc01aaa9 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -464,7 +464,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
{
unsigned long redzone_start;
unsigned long redzone_end;
- u8 tag;
+ u8 tag = 0xff;
if (gfpflags_allow_blocking(flags))
quarantine_reduce();
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 449044378782..a335f7c1fac4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
@@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
nr_none++;
continue;
}
@@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dd3c23a801b1..e19279ff6aa3 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/kmemleak-test.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "kmemleak: " fmt
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e57bf810f798..9dd581d11565 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/kmemleak.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
* For more information on the algorithm and kmemleak usage, please see
* Documentation/dev-tools/kmemleak.rst.
*
diff --git a/mm/ksm.c b/mm/ksm.c
index fc64874dc6f4..81c20ed57bf6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmu_notifier_range_init(&range, mm, pvmw.address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0730bf8ff39f..e4709fdaa8e6 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
@@ -37,11 +38,7 @@ static int lru_shrinker_id(struct list_lru *lru)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
- /*
- * This needs node 0 to be always present, even
- * in the systems supporting sparse numa ids.
- */
- return !!lru->node[0].memcg_lrus;
+ return lru->memcg_aware;
}
static inline struct list_lru_one *
@@ -451,6 +448,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
int i;
+ lru->memcg_aware = memcg_aware;
+
if (!memcg_aware)
return 0;
diff --git a/mm/maccess.c b/mm/maccess.c
index ec00be51a24f..482d4d670f19 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Access kernel memory without faulting.
*/
diff --git a/mm/madvise.c b/mm/madvise.c
index bb3a4554d5d5..628022e674a7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, mm, range.start, range.end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ range.start, range.end);
lru_add_drain();
tlb_gather_mmu(&tlb, mm, range.start, range.end);
diff --git a/mm/memblock.c b/mm/memblock.c
index e7665cf914b1..7d4f61ae666a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Procedures for maintaining information about logical memory blocks.
*
* Peter Bergner, IBM Corp. June 2001.
* Copyright (C) 2001 Peter Bergner.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -94,7 +90,7 @@
* :c:func:`mem_init` function frees all the memory to the buddy page
* allocator.
*
- * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
* memblock data structures will be discarded after the system
* initialization compltes.
*/
@@ -375,7 +371,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
/**
* memblock_discard - discard memory and reserved arrays if they were allocated
*/
@@ -702,7 +698,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -821,7 +817,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg(" memblock_free: [%pa-%pa] %pF\n",
+ memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -832,7 +828,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
@@ -1255,6 +1251,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/**
* memblock_alloc_range_nid - allocate boot memory block
@@ -1447,7 +1507,7 @@ void * __init memblock_alloc_try_nid_raw(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
@@ -1483,7 +1543,7 @@ void * __init memblock_alloc_try_nid(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
@@ -1508,7 +1568,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
phys_addr_t cursor, end;
end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pF\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n",
__func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
@@ -1923,7 +1983,7 @@ unsigned long __init memblock_free_all(void)
return pages;
}
-#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
static int memblock_debug_show(struct seq_file *m, void *private)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 81a0d3914ec9..ca0bc6e6be13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
@@ -19,16 +20,6 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/page_counter.h>
@@ -687,10 +678,119 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
+{
+ long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ atomic_long_add(x, &memcg->vmstats_local[idx]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+ struct mem_cgroup *parent;
+
+ parent = parent_mem_cgroup(pn->memcg);
+ if (!parent)
+ return NULL;
+ return mem_cgroup_nodeinfo(parent, nid);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ long x;
+
+ /* Update node */
+ __mod_node_page_state(pgdat, idx, val);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
+ /* Update memcg */
+ __mod_memcg_state(memcg, idx, val);
+
+ /* Update lruvec */
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup_per_node *pi;
+
+ atomic_long_add(x, &pn->lruvec_stat_local[idx]);
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occured
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count)
+{
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ atomic_long_add(x, &memcg->vmevents_local[idx]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->events[event]);
+ return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+ return atomic_long_read(&memcg->vmevents_local[event]);
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -722,35 +822,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -758,8 +830,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -775,7 +847,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
default:
break;
}
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
return true;
}
return false;
@@ -1353,12 +1425,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
+ K(memcg_page_state_local(iter,
+ memcg1_stats[i])));
}
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+ K(memcg_page_state_local(iter,
+ NR_LRU_BASE + i)));
pr_cont("\n");
}
@@ -1422,11 +1496,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -2100,7 +2178,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
@@ -2112,9 +2190,12 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
int nid;
long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
- if (x)
- atomic_long_add(x, &memcg->stat[i]);
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
+ if (x) {
+ atomic_long_add(x, &memcg->vmstats_local[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmstats[i]);
+ }
if (i >= NR_VM_NODE_STAT_ITEMS)
continue;
@@ -2124,17 +2205,24 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
pn = mem_cgroup_nodeinfo(memcg, nid);
x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- if (x)
- atomic_long_add(x, &pn->lruvec_stat[i]);
+ if (x) {
+ atomic_long_add(x, &pn->lruvec_stat_local[i]);
+ do {
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
+ }
}
}
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
long x;
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
- if (x)
- atomic_long_add(x, &memcg->events[i]);
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
+ if (x) {
+ atomic_long_add(x, &memcg->vmevents_local[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmevents[i]);
+ }
}
}
@@ -2964,50 +3052,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-struct accumulated_stats {
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
- unsigned long lru_pages[NR_LRU_LISTS];
- const unsigned int *stats_array;
- const unsigned int *events_array;
- int stats_size;
- int events_size;
-};
-
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
- struct accumulated_stats *acc)
-{
- struct mem_cgroup *mi;
- int i;
-
- for_each_mem_cgroup_tree(mi, memcg) {
- for (i = 0; i < acc->stats_size; i++)
- acc->stat[i] += memcg_page_state(mi,
- acc->stats_array ? acc->stats_array[i] : i);
-
- for (i = 0; i < acc->events_size; i++)
- acc->events[i] += memcg_sum_events(mi,
- acc->events_array ? acc->events_array[i] : i);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
- }
-}
-
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val = 0;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
- if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
- }
+ val = memcg_page_state(memcg, MEMCG_CACHE) +
+ memcg_page_state(memcg, MEMCG_RSS);
+ if (swap)
+ val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3331,6 +3384,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3402,7 +3491,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
- struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3411,17 +3499,18 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
+ memcg_page_state_local(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "%s %lu\n", memcg1_event_names[i],
- memcg_sum_events(memcg, memcg1_events[i]));
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3435,27 +3524,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
- acc.stats_array = memcg1_stats;
- acc.events_size = ARRAY_SIZE(memcg1_events);
- acc.events_array = memcg1_events;
- accumulate_memcg_tree(memcg, &acc);
-
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)acc.stat[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)acc.events[i]);
+ (u64)memcg_events(memcg, i));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -3888,11 +3971,11 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
*/
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = atomic_long_read(&memcg->stat[idx]);
+ long x = atomic_long_read(&memcg->vmstats[idx]);
int cpu;
for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
if (x < 0)
x = 0;
return x;
@@ -3927,8 +4010,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
@@ -4432,7 +4515,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat_cpu);
+ free_percpu(memcg->vmstats_percpu);
kfree(memcg);
}
@@ -4461,8 +4544,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg->id.id < 0)
goto fail;
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat_cpu)
+ memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ if (!memcg->vmstats_percpu)
goto fail;
for_each_node(node)
@@ -5548,7 +5631,6 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- struct accumulated_stats acc;
int i;
/*
@@ -5562,31 +5644,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = MEMCG_NR_STAT;
- acc.events_size = NR_VM_EVENT_ITEMS;
- accumulate_memcg_tree(memcg, &acc);
-
seq_printf(m, "anon %llu\n",
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
+ (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+ PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
/*
* TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
@@ -5595,43 +5673,47 @@ static int memory_stat_show(struct seq_file *m, void *v)
* where the page->mem_cgroup is set up and stable.
*/
seq_printf(m, "anon_thp %llu\n",
- (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+ PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+ PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+ seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+ seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
+ memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE));
seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
- acc.events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
- acc.events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
+ memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+ seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+ seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+ seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+ seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+ seq_printf(m, "thp_fault_alloc %lu\n",
+ memcg_events(memcg, THP_FAULT_ALLOC));
seq_printf(m, "thp_collapse_alloc %lu\n",
- acc.events[THP_COLLAPSE_ALLOC]);
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return 0;
@@ -6067,7 +6149,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc8b51744579..8da0334b9ca0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008, 2009 Intel Corporation
* Authors: Andi Kleen, Fengguang Wu
*
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 only as published by the
- * Free Software Foundation.
- *
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
diff --git a/mm/memory.c b/mm/memory.c
index 36aac6844662..ddf20bd0c317 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
*
@@ -519,7 +520,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
@@ -1010,7 +1011,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow = is_cow_mapping(vma->vm_flags);
if (is_cow) {
- mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1334,7 +1336,8 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
@@ -1356,7 +1359,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1382,7 +1386,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, address + size);
tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1523,6 +1528,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num, unsigned long offset)
+{
+ unsigned long count = vma_pages(vma);
+ unsigned long uaddr = vma->vm_start;
+ int ret, i;
+
+ /* Fail if the user requested offset is beyond the end of the object */
+ if (offset > num)
+ return -ENXIO;
+
+ /* Fail if the user requested size exceeds available object size */
+ if (count > num - offset)
+ return -ENXIO;
+
+ for (i = 0; i < count; i++) {
+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+ if (ret < 0)
+ return ret;
+ uaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/**
+ * vm_map_pages - maps range of kernel pages starts with non zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present. Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
@@ -2279,7 +2365,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -4104,8 +4191,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
@@ -4122,8 +4210,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b236069ff0d8..e096c987d261 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory_hotplug.c
*
@@ -39,6 +40,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "shuffle.h"
/*
* online_page_callback contains pointer to current page onlining function.
@@ -273,12 +275,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ unsigned long nr_pages, struct mhp_restrictions *restrictions)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct vmem_altmap *altmap = restrictions->altmap;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -299,7 +301,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, section_nr_to_pfn(i), altmap,
- want_memblock);
+ restrictions->flags & MHP_MEMBLOCK_API);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -516,26 +518,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn;
int scn_nr;
- int ret = -EINVAL;
- if (!valid_section(ms))
- return ret;
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
- ret = unregister_memory_section(ms);
- if (ret)
- return ret;
+ unregister_memory_section(ms);
scn_nr = __section_nr(ms);
start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
__remove_zone(zone, start_pfn);
sparse_remove_one_section(zone, ms, map_offset, altmap);
- return 0;
}
/**
@@ -550,31 +549,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove, ret = 0;
+ int sections_to_remove;
/* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) {
if (altmap)
map_offset = vmem_altmap_offset(altmap);
- } else {
- resource_size_t start, size;
-
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- ret = release_mem_region_adjustable(&iomem_resource, start,
- size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
}
clear_zone_contiguous(zone);
@@ -590,16 +575,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
cond_resched();
- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ __remove_section(zone, __pfn_to_section(pfn), map_offset,
+ altmap);
map_offset = 0;
- if (ret)
- break;
}
set_zone_contiguous(zone);
-
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -714,7 +695,7 @@ static void node_states_check_changes_online(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
- if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
+ if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
#endif
}
@@ -912,6 +893,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ shuffle_zone(zone);
+
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
@@ -1097,6 +1080,9 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
+ struct mhp_restrictions restrictions = {
+ .flags = MHP_MEMBLOCK_API,
+ };
u64 start, size;
bool new_node = false;
int ret;
@@ -1124,7 +1110,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, NULL, true);
+ ret = arch_add_memory(nid, start, size, &restrictions);
if (ret < 0)
goto error;
@@ -1341,8 +1327,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (hugepage_migration_supported(page_hstate(head)) &&
- page_huge_active(head))
+ if (page_huge_active(head))
return pfn;
skip = (1 << compound_order(head)) - (page - head);
pfn += skip - 1;
@@ -1382,10 +1367,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- if (compound_order(head) > PFN_SECTION_SHIFT) {
- ret = -EBUSY;
- break;
- }
pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
isolate_huge_page(head, &source);
continue;
@@ -1454,15 +1435,10 @@ static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
{
- __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
+ unsigned long *offlined_pages = (unsigned long *)data;
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
- offline_isolated_pages_cb);
+ *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
+ return 0;
}
/*
@@ -1472,26 +1448,7 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- int ret;
- long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
- offlined = nr_pages;
- if (!ret)
- *(long *)data += offlined;
- return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
- long offlined = 0;
- int ret;
-
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
- check_pages_isolated_cb);
- if (ret < 0)
- offlined = (long)ret;
- return offlined;
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1576,7 +1533,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, nr_pages;
- long offlined_pages;
+ unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
unsigned long valid_start, valid_end;
@@ -1652,14 +1609,15 @@ static int __ref __offline_pages(unsigned long start_pfn,
goto failed_removal_isolated;
}
/* check again */
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- } while (offlined_pages < 0);
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ NULL, check_pages_isolated_cb);
+ } while (ret);
- pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
- offline_isolated_pages(start_pfn, end_pfn);
-
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &offlined_pages, offline_isolated_pages_cb);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/*
* Onlining will reset pagetype flags and makes migrate type
* MOVABLE, so just need to decrease the number of isolated
@@ -1843,6 +1801,26 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
+static void __release_memory_resource(resource_size_t start,
+ resource_size_t size)
+{
+ int ret;
+
+ /*
+ * When removing memory in the same granularity as it was added,
+ * this function never fails. It might only fail if resources
+ * have to be adjusted or split. We'll ignore the error, as
+ * removing of memory cannot fail.
+ */
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+}
+
/**
* remove_memory
* @nid: the node ID
@@ -1877,6 +1855,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
arch_remove_memory(nid, start, size, NULL);
+ __release_memory_resource(start, size);
try_offline_node(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2219e747df49..01600d80ae01 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
diff --git a/mm/migrate.c b/mm/migrate.c
index 663a5449367a..f2ecc2855a12 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
@@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
- mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
+ migrate->start,
migrate->end);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->start, migrate->end, &mm_walk);
@@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
notified = true;
mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
migrate->vma->vm_mm,
addr, migrate->end);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/mincore.c b/mm/mincore.c
index 218099b5ed31..c3f058bd0faf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -169,6 +169,22 @@ out:
return 0;
}
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+ * correspond to the files the calling process could (if tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ mincore_walk.mm = vma->vm_mm;
err = walk_page_range(addr, end, &mincore_walk);
if (err < 0)
return err;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 33917105a3a2..5c918388de99 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm_init.c - Memory initialisation verification and debugging
*
diff --git a/mm/mmap.c b/mm/mmap.c
index bd7b9f293b39..7e8c3e8ae75f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/mmap.c
*
@@ -2735,9 +2736,17 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return -EINVAL;
len = PAGE_ALIGN(len);
+ end = start + len;
if (len == 0)
return -EINVAL;
+ /*
+ * arch_unmap() might do unmaps itself. It must be called
+ * and finish any rbtree manipulation before this code
+ * runs and also starts to manipulate the rbtree.
+ */
+ arch_unmap(mm, start, end);
+
/* Find the first overlapping VMA */
vma = find_vma(mm, start);
if (!vma)
@@ -2746,7 +2755,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
- end = start + len;
if (vma->vm_start >= end)
return 0;
@@ -2816,12 +2824,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/* Detach vmas from rbtree */
detach_vmas_to_be_unmapped(mm, vma, prev, end);
- /*
- * mpx unmap needs to be called with mmap_sem held for write.
- * It is safe to call it before unmap_region().
- */
- arch_unmap(mm, vma, start, end);
-
if (downgrade)
downgrade_write(&mm->mmap_sem);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9c884abc7850..ee36068077b6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
mn->ops->invalidate_range_start, _ret,
- !range->blockable ? "non-" : "");
+ !mmu_notifier_range_blockable(range) ? "non-" : "");
ret = _ret;
}
}
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+ if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+ return false;
+ /* Return true if the vma still have the read flag set. */
+ return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 028c724dcb1a..bf38dfbbb4b4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -39,7 +39,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
@@ -136,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -150,7 +149,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -185,7 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
- mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma, vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index e3edef6b7a12..fc241d23cd97 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 749276beb109..d8c02fbe03b5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/nommu.c
*
@@ -473,6 +474,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a2484884cfd..5a58778c91d4 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
*
@@ -531,7 +532,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_notifier_range range;
struct mmu_gather tlb;
- mmu_notifier_range_init(&range, mm, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm, range.start, range.end);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9f61dfec6a1f..bdbe8b6b1225 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/page-writeback.c
*
@@ -2808,6 +2809,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
}
EXPORT_SYMBOL(__test_set_page_writeback);
+/*
+ * Wait for a page to complete writeback
+ */
+void wait_on_page_writeback(struct page *page)
+{
+ if (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59661106da16..d66bc8abe0af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
@@ -43,6 +44,7 @@
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
@@ -72,6 +74,7 @@
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#include "shuffle.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -755,12 +758,6 @@ static inline void set_page_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-static inline void rmv_page_order(struct page *page)
-{
- __ClearPageBuddy(page);
- set_page_private(page, 0);
-}
-
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
@@ -918,13 +915,10 @@ continue_merging:
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
- if (page_is_guard(buddy)) {
+ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
- } else {
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
- }
+ else
+ del_page_from_free_area(buddy, &zone->free_area[order]);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -966,7 +960,8 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+ && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
@@ -974,15 +969,18 @@ done_merging:
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
- list_add_tail(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- goto out;
+ add_to_free_area_tail(page, &zone->free_area[order],
+ migratetype);
+ return;
}
}
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
- zone->free_area[order].nr_free++;
+ if (is_shuffle_order(order))
+ add_to_free_area_random(page, &zone->free_area[order],
+ migratetype);
+ else
+ add_to_free_area(page, &zone->free_area[order], migratetype);
+
}
/*
@@ -1416,36 +1414,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1574,21 +1558,13 @@ static inline void __init pgdat_init_report_one_done(void)
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1596,15 +1572,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1624,17 +1599,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1649,18 +1625,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
return (nr_pages);
}
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1686,31 +1744,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1734,14 +1788,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1770,38 +1821,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ return true;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
- pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
@@ -1824,9 +1872,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
void __init page_alloc_init_late(void)
{
struct zone *zone;
+ int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- int nid;
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1846,10 +1894,12 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
/* Discard memblock private memory */
memblock_discard();
-#endif
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -1921,8 +1971,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
+ add_to_free_area(&page[size], area, migratetype);
set_page_order(&page[size], high);
}
}
@@ -1937,7 +1986,7 @@ static void check_new_page_bad(struct page *page)
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
bad_flags = __PG_HWPOISON;
@@ -2064,13 +2113,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- page = list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ del_page_from_free_area(page, area);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -2156,8 +2202,7 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_move(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2345,7 +2390,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
single_page:
area = &zone->free_area[current_order];
- list_move(&page->lru, &area->free_list[start_type]);
+ move_to_free_area(page, area, start_type);
}
/*
@@ -2369,7 +2414,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
if (fallback_mt == MIGRATE_TYPES)
break;
- if (list_empty(&area->free_list[fallback_mt]))
+ if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
@@ -2456,9 +2501,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- page = list_first_entry_or_null(
- &area->free_list[MIGRATE_HIGHATOMIC],
- struct page, lru);
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
@@ -2581,8 +2624,7 @@ find_smallest:
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
@@ -3019,6 +3061,7 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
+ struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -3043,9 +3086,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/* Remove page from free list */
- list_del(&page->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(page);
+
+ del_page_from_free_area(page, area);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3120,9 +3162,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3134,7 +3175,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -3154,8 +3195,8 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
goto out;
}
@@ -3343,13 +3384,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!list_empty(&area->free_list[mt]))
+ if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ !free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
@@ -4821,7 +4862,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
@@ -4838,6 +4879,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
@@ -4848,7 +4892,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
@@ -4858,7 +4902,12 @@ EXPORT_SYMBOL(alloc_pages_exact);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -5268,7 +5317,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!list_empty(&area->free_list[type]))
+ if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
@@ -6247,13 +6296,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
@@ -8129,8 +8180,7 @@ unmovable:
return true;
}
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -8339,8 +8389,9 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -8352,7 +8403,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -8394,7 +8444,7 @@ void zone_pcp_reset(struct zone *zone)
* All pages in the range must be in a single zone and isolated
* before calling this.
*/
-void
+unsigned long
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
@@ -8402,12 +8452,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned int order, i;
unsigned long pfn;
unsigned long flags;
+ unsigned long offlined_pages = 0;
+
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
break;
if (pfn == end_pfn)
- return;
+ return offlined_pages;
+
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
@@ -8425,24 +8478,26 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
+ offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
- list_del(&page->lru);
- rmv_page_order(page);
- zone->free_area[order].nr_free--;
+ del_page_from_free_area(page, &zone->free_area[order]);
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return offlined_pages;
}
#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 019280712e1b..e3638a5bafff 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
for (i = 0; i < nr_pages; i++) {
struct page *page;
- if (!pfn_valid_within(pfn + i))
- continue;
page = pfn_to_online_page(pfn + i);
if (!page)
continue;
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index b1739dc06b73..0468ba500bd4 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -9,8 +9,17 @@
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
+ *
+ * The scan hint is the largest known contiguous area before the contig hint.
+ * It is not necessarily the actual largest contig hint though. There is an
+ * invariant that the scan_hint_start > contig_hint_start iff
+ * scan_hint == contig_hint. This is necessary because when scanning forward,
+ * we don't know if a new contig hint would be better than the current one.
*/
struct pcpu_block_md {
+ int scan_hint; /* scan hint for block */
+ int scan_hint_start; /* block relative starting
+ position of the scan hint */
int contig_hint; /* contig hint for block */
int contig_hint_start; /* block relative starting
position of the contig hint */
@@ -19,6 +28,7 @@ struct pcpu_block_md {
int right_free; /* size of free space along
the right side of the block */
int first_free; /* block position of first free */
+ int nr_bits; /* total bits responsible for */
};
struct pcpu_chunk {
@@ -29,9 +39,7 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
- int contig_bits; /* max contiguous size hint */
- int contig_bits_start; /* contig_bits starting
- offset */
+ struct pcpu_block_md chunk_md;
void *base_addr; /* base address of this chunk */
unsigned long *alloc_map; /* allocation map */
@@ -39,7 +47,6 @@ struct pcpu_chunk {
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
int start_offset; /* the overlap with the previous
region to have a page aligned
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index b68d5df14731..20d2b69a13b0 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-km.c - kernel memory based chunk allocation
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
- *
* Chunks are allocated as a contiguous kernel memory using gfp
* allocation. This is to be used on nommu architectures.
*
@@ -70,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
chunk->base_addr = page_address(pages);
spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_chunk_populated(chunk, 0, nr_pages, false);
+ pcpu_chunk_populated(chunk, 0, nr_pages);
spin_unlock_irqrestore(&pcpu_lock, flags);
pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index b5fdd43b60c9..a5a8b22816ff 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennisz@fb.com>
*
- * This file is released under the GPLv2.
- *
* Prints statistics about the percpu allocator and backing chunks.
*/
#include <linux/debugfs.h>
@@ -53,6 +52,7 @@ static int find_max_nr_alloc(void)
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
int *buffer)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
@@ -121,9 +121,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
P("empty_pop_pages", chunk->nr_empty_pop_pages);
- P("first_bit", chunk->first_bit);
+ P("first_bit", chunk_md->first_free);
P("free_bytes", chunk->free_bytes);
- P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
+ P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index d8078de912de..a2b395acef89 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-vm.c - vmalloc area based chunk allocation
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
- *
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
diff --git a/mm/percpu.c b/mm/percpu.c
index 68dd2e7e73b5..9821241fdede 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu.c - percpu memory allocator
*
@@ -7,8 +8,6 @@
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
*
- * This file is released under the GPLv2 license.
- *
* The percpu allocator handles both static and dynamic areas. Percpu
* areas are allocated in chunks which are divided into units. There is
* a 1-to-1 mapping for units to possible cpus. These units are grouped
@@ -94,6 +93,8 @@
/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT 5
+/* chunks in slots below this are subject to being sidelined on failed alloc */
+#define PCPU_SLOT_FAIL_THRESHOLD 3
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -231,10 +232,13 @@ static int pcpu_size_to_slot(int size)
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
+ const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
+ chunk_md->contig_hint == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_bytes);
+ return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
/* set the pointer to a chunk in a page struct */
@@ -318,6 +322,34 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
+/*
+ * pcpu_next_hint - determine which hint to use
+ * @block: block of interest
+ * @alloc_bits: size of allocation
+ *
+ * This determines if we should scan based on the scan_hint or first_free.
+ * In general, we want to scan from first_free to fulfill allocations by
+ * first fit. However, if we know a scan_hint at position scan_hint_start
+ * cannot fulfill an allocation, we can begin scanning from there knowing
+ * the contig_hint will be our fallback.
+ */
+static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
+{
+ /*
+ * The three conditions below determine if we can skip past the
+ * scan_hint. First, does the scan hint exist. Second, is the
+ * contig_hint after the scan_hint (possibly not true iff
+ * contig_hint == scan_hint). Third, is the allocation request
+ * larger than the scan_hint.
+ */
+ if (block->scan_hint &&
+ block->contig_hint_start > block->scan_hint_start &&
+ alloc_bits > block->scan_hint)
+ return block->scan_hint_start + block->scan_hint;
+
+ return block->first_free;
+}
+
/**
* pcpu_next_md_free_region - finds the next hint free area
* @chunk: chunk of interest
@@ -413,9 +445,11 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
if (block->contig_hint &&
block->contig_hint_start >= block_off &&
block->contig_hint >= *bits + alloc_bits) {
+ int start = pcpu_next_hint(block, alloc_bits);
+
*bits += alloc_bits + block->contig_hint_start -
- block->first_free;
- *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ start;
+ *bit_off = pcpu_block_off_to_off(i, start);
return;
}
/* reset to satisfy the second predicate above */
@@ -488,6 +522,22 @@ static void pcpu_mem_free(void *ptr)
kvfree(ptr);
}
+static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
+ bool move_front)
+{
+ if (chunk != pcpu_reserved_chunk) {
+ if (move_front)
+ list_move(&chunk->list, &pcpu_slot[slot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ }
+}
+
+static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
+{
+ __pcpu_chunk_move(chunk, slot, true);
+}
+
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
@@ -505,110 +555,39 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
- if (chunk != pcpu_reserved_chunk && oslot != nslot) {
- if (oslot < nslot)
- list_move(&chunk->list, &pcpu_slot[nslot]);
- else
- list_move_tail(&chunk->list, &pcpu_slot[nslot]);
- }
+ if (oslot != nslot)
+ __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
-/**
- * pcpu_cnt_pop_pages- counts populated backing pages in range
+/*
+ * pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
- * @bit_off: start offset
- * @bits: size of area to check
+ * @nr: nr of empty pages
*
- * Calculates the number of populated pages in the region
- * [page_start, page_end). This keeps track of how many empty populated
- * pages are available and decide if async work should be scheduled.
- *
- * RETURNS:
- * The nr of populated pages.
+ * This is used to keep track of the empty pages now based on the premise
+ * a md_block covers a page. The hint update functions recognize if a block
+ * is made full or broken to calculate deltas for keeping track of free pages.
*/
-static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
- int bits)
+static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
- int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
- int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
-
- if (page_start >= page_end)
- return 0;
-
- /*
- * bitmap_weight counts the number of bits set in a bitmap up to
- * the specified number of bits. This is counting the populated
- * pages up to page_end and then subtracting the populated pages
- * up to page_start to count the populated pages in
- * [page_start, page_end).
- */
- return bitmap_weight(chunk->populated, page_end) -
- bitmap_weight(chunk->populated, page_start);
-}
-
-/**
- * pcpu_chunk_update - updates the chunk metadata given a free area
- * @chunk: chunk of interest
- * @bit_off: chunk offset
- * @bits: size of free area
- *
- * This updates the chunk's contig hint and starting offset given a free area.
- * Choose the best starting offset if the contig hint is equal.
- */
-static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
-{
- if (bits > chunk->contig_bits) {
- chunk->contig_bits_start = bit_off;
- chunk->contig_bits = bits;
- } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
- (!bit_off ||
- __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
- /* use the start with the best alignment */
- chunk->contig_bits_start = bit_off;
- }
+ chunk->nr_empty_pop_pages += nr;
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages += nr;
}
-/**
- * pcpu_chunk_refresh_hint - updates metadata about a chunk
- * @chunk: chunk of interest
- *
- * Iterates over the metadata blocks to find the largest contig area.
- * It also counts the populated pages and uses the delta to update the
- * global count.
- *
- * Updates:
- * chunk->contig_bits
- * chunk->contig_bits_start
- * nr_empty_pop_pages (chunk and global)
+/*
+ * pcpu_region_overlap - determines if two regions overlap
+ * @a: start of first region, inclusive
+ * @b: end of first region, exclusive
+ * @x: start of second region, inclusive
+ * @y: end of second region, exclusive
+ *
+ * This is used to determine if the hint region [a, b) overlaps with the
+ * allocated region [x, y).
*/
-static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
- int bit_off, bits, nr_empty_pop_pages;
-
- /* clear metadata */
- chunk->contig_bits = 0;
-
- bit_off = chunk->first_bit;
- bits = nr_empty_pop_pages = 0;
- pcpu_for_each_md_free_region(chunk, bit_off, bits) {
- pcpu_chunk_update(chunk, bit_off, bits);
-
- nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
- }
-
- /*
- * Keep track of nr_empty_pop_pages.
- *
- * The chunk maintains the previous number of free pages it held,
- * so the delta is used to update the global counter. The reserved
- * chunk is not part of the free page count as they are populated
- * at init and are special to serving reserved allocations.
- */
- if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages +=
- (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
-
- chunk->nr_empty_pop_pages = nr_empty_pop_pages;
+ return (a < y) && (x < b);
}
/**
@@ -629,16 +608,132 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
if (start == 0)
block->left_free = contig;
- if (end == PCPU_BITMAP_BLOCK_BITS)
+ if (end == block->nr_bits)
block->right_free = contig;
if (contig > block->contig_hint) {
+ /* promote the old contig_hint to be the new scan_hint */
+ if (start > block->contig_hint_start) {
+ if (block->contig_hint > block->scan_hint) {
+ block->scan_hint_start =
+ block->contig_hint_start;
+ block->scan_hint = block->contig_hint;
+ } else if (start < block->scan_hint_start) {
+ /*
+ * The old contig_hint == scan_hint. But, the
+ * new contig is larger so hold the invariant
+ * scan_hint_start < contig_hint_start.
+ */
+ block->scan_hint = 0;
+ }
+ } else {
+ block->scan_hint = 0;
+ }
block->contig_hint_start = start;
block->contig_hint = contig;
- } else if (block->contig_hint_start && contig == block->contig_hint &&
- (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
- /* use the start with the best alignment */
- block->contig_hint_start = start;
+ } else if (contig == block->contig_hint) {
+ if (block->contig_hint_start &&
+ (!start ||
+ __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* start has a better alignment so use it */
+ block->contig_hint_start = start;
+ if (start < block->scan_hint_start &&
+ block->contig_hint > block->scan_hint)
+ block->scan_hint = 0;
+ } else if (start > block->scan_hint_start ||
+ block->contig_hint > block->scan_hint) {
+ /*
+ * Knowing contig == contig_hint, update the scan_hint
+ * if it is farther than or larger than the current
+ * scan_hint.
+ */
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ } else {
+ /*
+ * The region is smaller than the contig_hint. So only update
+ * the scan_hint if it is larger than or equal and farther than
+ * the current scan_hint.
+ */
+ if ((start < block->contig_hint_start &&
+ (contig > block->scan_hint ||
+ (contig == block->scan_hint &&
+ start > block->scan_hint_start)))) {
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ }
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int s_off = pcpu_off_to_block_off(bit_off);
+ int e_off = s_off + bits;
+ int s_index, l_bit;
+ struct pcpu_block_md *block;
+
+ if (e_off > PCPU_BITMAP_BLOCK_BITS)
+ return;
+
+ s_index = pcpu_off_to_block_index(bit_off);
+ block = chunk->md_blocks + s_index;
+
+ /* scan backwards in case of alignment skipping free bits */
+ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+ s_off = (s_off == l_bit) ? 0 : l_bit + 1;
+
+ pcpu_block_update(block, s_off, e_off);
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
+ * @full_scan: if we should scan from the beginning
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits;
+
+ /* promote scan_hint to contig_hint */
+ if (!full_scan && chunk_md->scan_hint) {
+ bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
+ chunk_md->contig_hint_start = chunk_md->scan_hint_start;
+ chunk_md->contig_hint = chunk_md->scan_hint;
+ chunk_md->scan_hint = 0;
+ } else {
+ bit_off = chunk_md->first_free;
+ chunk_md->contig_hint = 0;
+ }
+
+ bits = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
}
@@ -654,14 +749,23 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
- int rs, re; /* region start, region end */
+ int rs, re, start; /* region start, region end */
+
+ /* promote scan_hint to contig_hint */
+ if (block->scan_hint) {
+ start = block->scan_hint_start + block->scan_hint;
+ block->contig_hint_start = block->scan_hint_start;
+ block->contig_hint = block->scan_hint;
+ block->scan_hint = 0;
+ } else {
+ start = block->first_free;
+ block->contig_hint = 0;
+ }
- /* clear hints */
- block->contig_hint = 0;
- block->left_free = block->right_free = 0;
+ block->right_free = 0;
/* iterate over free areas and update the contig hints */
- pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
+ pcpu_for_each_unpop_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS) {
pcpu_block_update(block, rs, re);
}
@@ -680,6 +784,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -704,15 +810,29 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
* If the allocation breaks the contig_hint, a scan is required to
* restore this hint.
*/
+ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
- if (s_off >= s_block->contig_hint_start &&
- s_off < s_block->contig_hint_start + s_block->contig_hint) {
+ if (pcpu_region_overlap(s_block->scan_hint_start,
+ s_block->scan_hint_start + s_block->scan_hint,
+ s_off,
+ s_off + bits))
+ s_block->scan_hint = 0;
+
+ if (pcpu_region_overlap(s_block->contig_hint_start,
+ s_block->contig_hint_start +
+ s_block->contig_hint,
+ s_off,
+ s_off + bits)) {
/* block contig hint is broken - scan to fix it */
+ if (!s_off)
+ s_block->left_free = 0;
pcpu_block_refresh_hint(chunk, s_index);
} else {
/* update left and right contig manually */
@@ -728,6 +848,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
* Update e_block.
*/
if (s_index != e_index) {
+ if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
/*
* When the allocation is across blocks, the end is along
* the left part of the e_block.
@@ -740,11 +863,14 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
/* reset the block */
e_block++;
} else {
+ if (e_off > e_block->scan_hint_start)
+ e_block->scan_hint = 0;
+
+ e_block->left_free = 0;
if (e_off > e_block->contig_hint_start) {
/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
- e_block->left_free = 0;
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
@@ -752,21 +878,36 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
}
/* update in-between md_blocks */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
+ block->scan_hint = 0;
block->contig_hint = 0;
block->left_free = 0;
block->right_free = 0;
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, -nr_empty_pages);
+
+ if (pcpu_region_overlap(chunk_md->scan_hint_start,
+ chunk_md->scan_hint_start +
+ chunk_md->scan_hint,
+ bit_off,
+ bit_off + bits))
+ chunk_md->scan_hint = 0;
+
/*
* The only time a full chunk scan is required is if the chunk
* contig hint is broken. Otherwise, it means a smaller space
* was used and therefore the chunk contig hint is still correct.
*/
- if (bit_off >= chunk->contig_bits_start &&
- bit_off < chunk->contig_bits_start + chunk->contig_bits)
- pcpu_chunk_refresh_hint(chunk);
+ if (pcpu_region_overlap(chunk_md->contig_hint_start,
+ chunk_md->contig_hint_start +
+ chunk_md->contig_hint,
+ bit_off,
+ bit_off + bits))
+ pcpu_chunk_refresh_hint(chunk, false);
}
/**
@@ -782,13 +923,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
*
* A chunk update is triggered if a page becomes free, a block becomes free,
* or the free spans across blocks. This tradeoff is to minimize iterating
- * over the block metadata to update chunk->contig_bits. chunk->contig_bits
- * may be off by up to a page, but it will never be more than the available
- * space. If the contig hint is contained in one block, it will be accurate.
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
*/
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -842,16 +985,22 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
/* update s_block */
e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(s_block, start, e_off);
/* freeing in the same block */
if (s_index != e_index) {
/* update e_block */
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
/* reset md_blocks in the middle */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
block->first_free = 0;
+ block->scan_hint = 0;
block->contig_hint_start = 0;
block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
block->left_free = PCPU_BITMAP_BLOCK_BITS;
@@ -859,19 +1008,21 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, nr_empty_pages);
+
/*
- * Refresh chunk metadata when the free makes a page free, a block
- * free, or spans across blocks. The contig hint may be off by up to
- * a page, but if the hint is contained in a block, it will be accurate
- * with the else condition below.
+ * Refresh chunk metadata when the free makes a block free or spans
+ * across blocks. The contig_hint may be off by up to a page, but if
+ * the contig_hint is contained in a block, it will be accurate with
+ * the else condition below.
*/
- if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
- ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
- s_index != e_index)
- pcpu_chunk_refresh_hint(chunk);
+ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk, true);
else
- pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
- s_block->contig_hint);
+ pcpu_block_update(&chunk->chunk_md,
+ pcpu_block_off_to_off(s_index, start),
+ end);
}
/**
@@ -926,6 +1077,7 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, bool pop_only)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, next_off;
/*
@@ -934,12 +1086,12 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
* cannot fit in the global hint, there is memory pressure and creating
* a new chunk would happen soon.
*/
- bit_off = ALIGN(chunk->contig_bits_start, align) -
- chunk->contig_bits_start;
- if (bit_off + alloc_bits > chunk->contig_bits)
+ bit_off = ALIGN(chunk_md->contig_hint_start, align) -
+ chunk_md->contig_hint_start;
+ if (bit_off + alloc_bits > chunk_md->contig_hint)
return -1;
- bit_off = chunk->first_bit;
+ bit_off = pcpu_next_hint(chunk_md, alloc_bits);
bits = 0;
pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
@@ -956,6 +1108,62 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
return bit_off;
}
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: offset of the largest area skipped
+ * @largest_bits: size of the largest area skipped
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask,
+ unsigned long *largest_off,
+ unsigned long *largest_bits)
+{
+ unsigned long index, end, i, area_off, area_bits;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = __ALIGN_MASK(index, align_mask);
+ area_off = index;
+
+ end = index + nr;
+ if (end > size)
+ return end;
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ area_bits = i - area_off;
+ /* remember largest unused area with best alignment */
+ if (area_bits > *largest_bits ||
+ (area_bits == *largest_bits && *largest_off &&
+ (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+ *largest_off = area_off;
+ *largest_bits = area_bits;
+ }
+
+ start = i + 1;
+ goto again;
+ }
+ return index;
+}
+
/**
* pcpu_alloc_area - allocates an area from a pcpu_chunk
* @chunk: chunk of interest
@@ -978,7 +1186,9 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
size_t align_mask = (align) ? (align - 1) : 0;
+ unsigned long area_off = 0, area_bits = 0;
int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -988,12 +1198,16 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
/*
* Search to find a fit.
*/
- end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
- bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
- alloc_bits, align_mask);
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
+ bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+ align_mask, &area_off, &area_bits);
if (bit_off >= end)
return -1;
+ if (area_bits)
+ pcpu_block_update_scan(chunk, area_off, area_bits);
+
/* update alloc map */
bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
@@ -1005,8 +1219,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- if (bit_off == chunk->first_bit)
- chunk->first_bit = find_next_zero_bit(
+ if (bit_off == chunk_md->first_free)
+ chunk_md->first_free = find_next_zero_bit(
chunk->alloc_map,
pcpu_chunk_map_bits(chunk),
bit_off + alloc_bits);
@@ -1028,6 +1242,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
*/
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -1047,24 +1262,34 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- chunk->first_bit = min(chunk->first_bit, bit_off);
+ chunk_md->first_free = min(chunk_md->first_free, bit_off);
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
}
+static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
+{
+ block->scan_hint = 0;
+ block->contig_hint = nr_bits;
+ block->left_free = nr_bits;
+ block->right_free = nr_bits;
+ block->first_free = 0;
+ block->nr_bits = nr_bits;
+}
+
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
struct pcpu_block_md *md_block;
+ /* init the chunk's block */
+ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
+
for (md_block = chunk->md_blocks;
md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
- md_block++) {
- md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
- md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
- md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
- }
+ md_block++)
+ pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}
/**
@@ -1143,11 +1368,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
chunk->immutable = true;
bitmap_fill(chunk->populated, chunk->nr_pages);
chunk->nr_populated = chunk->nr_pages;
- chunk->nr_empty_pop_pages =
- pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
- map_size / PCPU_MIN_ALLOC_SIZE);
+ chunk->nr_empty_pop_pages = chunk->nr_pages;
- chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
chunk->free_bytes = map_size;
if (chunk->start_offset) {
@@ -1157,7 +1379,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
set_bit(0, chunk->bound_map);
set_bit(offset_bits, chunk->bound_map);
- chunk->first_bit = offset_bits;
+ chunk->chunk_md.first_free = offset_bits;
pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
@@ -1210,7 +1432,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
pcpu_init_md_blocks(chunk);
/* init metadata */
- chunk->contig_bits = region_bits;
chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
@@ -1240,7 +1461,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
- * @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
@@ -1250,7 +1470,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* is to serve an allocation in that area.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
- int page_end, bool for_alloc)
+ int page_end)
{
int nr = page_end - page_start;
@@ -1260,10 +1480,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
chunk->nr_populated += nr;
pcpu_nr_populated += nr;
- if (!for_alloc) {
- chunk->nr_empty_pop_pages += nr;
- pcpu_nr_empty_pop_pages += nr;
- }
+ pcpu_update_empty_pages(chunk, nr);
}
/**
@@ -1285,9 +1502,9 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
- chunk->nr_empty_pop_pages -= nr;
- pcpu_nr_empty_pop_pages -= nr;
pcpu_nr_populated -= nr;
+
+ pcpu_update_empty_pages(chunk, -nr);
}
/*
@@ -1374,7 +1591,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
- struct pcpu_chunk *chunk;
+ struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
unsigned long flags;
@@ -1436,11 +1653,14 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
- if (off < 0)
+ if (off < 0) {
+ if (slot < PCPU_SLOT_FAIL_THRESHOLD)
+ pcpu_chunk_move(chunk, 0);
continue;
+ }
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
@@ -1499,7 +1719,7 @@ area_found:
err = "failed to populate";
goto fail_unlock;
}
- pcpu_chunk_populated(chunk, rs, re, true);
+ pcpu_chunk_populated(chunk, rs, re);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
@@ -1698,7 +1918,7 @@ retry_pop:
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, rs, rs + nr, false);
+ pcpu_chunk_populated(chunk, rs, rs + nr);
spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
@@ -1738,6 +1958,7 @@ void free_percpu(void __percpu *ptr)
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
+ bool need_balance = false;
if (!ptr)
return;
@@ -1759,7 +1980,7 @@ void free_percpu(void __percpu *ptr)
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
- pcpu_schedule_balance_work();
+ need_balance = true;
break;
}
}
@@ -1767,6 +1988,9 @@ void free_percpu(void __percpu *ptr)
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index a447092d4635..357aa7bef6c0 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* linux/mm/process_vm_access.c
*
* Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/mm.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index a4593654a26c..2fe72cd29b47 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/readahead.c - address_space-level file readahead.
*
diff --git a/mm/rmap.c b/mm/rmap.c
index b30c7c71d1d9..e5dfe2ae6b0d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -850,7 +850,7 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
- if (!page_mapped(page))
+ if (!pra.mapcount)
return 0;
if (!page_rmapping(page))
@@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
mmu_notifier_invalidate_range_start(&range);
@@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
continue;
flush_cache_page(vma, address, page_to_pfn(page));
- entry = pmdp_huge_clear_flush(vma, address, pmd);
+ entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
@@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index d908c8769b48..5e313fa93276 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* rodata_test.c: functional test for mark_rodata_ro function
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#define pr_fmt(fmt) "rodata_test: " fmt
diff --git a/mm/shmem.c b/mm/shmem.c
index 2275a0ff7c30..1bb3b8dc8bb2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page,
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
@@ -3631,9 +3631,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &info->vfs_inode;
}
-static void shmem_destroy_callback(struct rcu_head *head)
+static void shmem_free_in_core_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
@@ -3643,7 +3642,6 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
static void shmem_init_inode(void *foo)
@@ -3734,6 +3732,7 @@ static const struct inode_operations shmem_special_inode_operations = {
static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
+ .free_inode = shmem_free_in_core_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000000..3ce12481b1dc
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
+ * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+ if (ctl == SHUFFLE_FORCE_DISABLE)
+ set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+ if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+ if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_disable(&page_alloc_shuffle_key);
+ } else if (ctl == SHUFFLE_ENABLE
+ && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+ ? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+ if (shuffle_param)
+ page_alloc_shuffle(SHUFFLE_ENABLE);
+ else
+ page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+ struct page *page;
+
+ /*
+ * Given we're dealing with randomly selected pfns in a zone we
+ * need to ask questions like...
+ */
+
+ /* ...is the pfn even in the memmap? */
+ if (!pfn_valid_within(pfn))
+ return NULL;
+
+ /* ...is the pfn in a present section or a hole? */
+ if (!pfn_present(pfn))
+ return NULL;
+
+ /* ...is the page free and currently on a free_area list? */
+ page = pfn_to_page(pfn);
+ if (!PageBuddy(page))
+ return NULL;
+
+ /*
+ * ...is the page on the same list as the page we will
+ * shuffle it with?
+ */
+ if (page_order(page) != order)
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+ unsigned long i, flags;
+ unsigned long start_pfn = z->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(z);
+ const int order = SHUFFLE_ORDER;
+ const int order_pages = 1 << order;
+
+ spin_lock_irqsave(&z->lock, flags);
+ start_pfn = ALIGN(start_pfn, order_pages);
+ for (i = start_pfn; i < end_pfn; i += order_pages) {
+ unsigned long j;
+ int migratetype, retry;
+ struct page *page_i, *page_j;
+
+ /*
+ * We expect page_i, in the sub-range of a zone being added
+ * (@start_pfn to @end_pfn), to more likely be valid compared to
+ * page_j randomly selected in the span @zone_start_pfn to
+ * @spanned_pages.
+ */
+ page_i = shuffle_valid_page(i, order);
+ if (!page_i)
+ continue;
+
+ for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+ /*
+ * Pick a random order aligned page in the zone span as
+ * a swap target. If the selected pfn is a hole, retry
+ * up to SHUFFLE_RETRY attempts find a random valid pfn
+ * in the zone.
+ */
+ j = z->zone_start_pfn +
+ ALIGN_DOWN(get_random_long() % z->spanned_pages,
+ order_pages);
+ page_j = shuffle_valid_page(j, order);
+ if (page_j && page_j != page_i)
+ break;
+ }
+ if (retry >= SHUFFLE_RETRY) {
+ pr_debug("%s: failed to swap %#lx\n", __func__, i);
+ continue;
+ }
+
+ /*
+ * Each migratetype corresponds to its own list, make sure the
+ * types match otherwise we're moving pages to lists where they
+ * do not belong.
+ */
+ migratetype = get_pageblock_migratetype(page_i);
+ if (get_pageblock_migratetype(page_j) != migratetype) {
+ pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+ continue;
+ }
+
+ list_swap(&page_i->lru, &page_j->lru);
+
+ pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+ /* take it easy on the zone lock */
+ if ((i % (100 * order_pages)) == 0) {
+ spin_unlock_irqrestore(&z->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ shuffle_zone(z);
+}
+
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ static u64 rand;
+ static u8 rand_bits;
+
+ /*
+ * The lack of locking is deliberate. If 2 threads race to
+ * update the rand state it just adds to the entropy.
+ */
+ if (rand_bits == 0) {
+ rand_bits = 64;
+ rand = get_random_u64();
+ }
+
+ if (rand & 1)
+ add_to_free_area(page, area, migratetype);
+ else
+ add_to_free_area_tail(page, area, migratetype);
+ rand_bits--;
+ rand >>= 1;
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000000..777a257a0d2f
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is called from the command line enabling path, or by
+ * platform-firmware enabling that indicates the presence of a
+ * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
+ * the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+ SHUFFLE_ENABLE,
+ SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_zone(z);
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return false;
+ return order >= SHUFFLE_ORDER;
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ return false;
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
diff --git a/mm/slab.c b/mm/slab.c
index 284ab737faee..f7117ad9b3a3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -362,29 +362,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline bool is_store_user_clean(struct kmem_cache *cachep)
-{
- return atomic_read(&cachep->store_user_clean) == 1;
-}
-
-static inline void set_store_user_clean(struct kmem_cache *cachep)
-{
- atomic_set(&cachep->store_user_clean, 1);
-}
-
-static inline void set_store_user_dirty(struct kmem_cache *cachep)
-{
- if (is_store_user_clean(cachep))
- atomic_set(&cachep->store_user_clean, 0);
-}
-
-#else
-static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
-
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -990,10 +967,8 @@ static void cpuup_canceled(long cpu)
/* cpu is dead; no one can alloc from it. */
nc = per_cpu_ptr(cachep->cpu_cache, cpu);
- if (nc) {
- free_block(cachep, nc->entry, nc->avail, node, &list);
- nc->avail = 0;
- }
+ free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
if (!cpumask_empty(mask)) {
spin_unlock_irq(&n->list_lock);
@@ -1674,8 +1649,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
struct page *page, *n;
- list_for_each_entry_safe(page, n, list, lru) {
- list_del(&page->lru);
+ list_for_each_entry_safe(page, n, list, slab_list) {
+ list_del(&page->slab_list);
slab_destroy(cachep, page);
}
}
@@ -2231,8 +2206,8 @@ static int drain_freelist(struct kmem_cache *cache,
goto out;
}
- page = list_entry(p, struct page, lru);
- list_del(&page->lru);
+ page = list_entry(p, struct page, slab_list);
+ list_del(&page->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2554,11 +2529,6 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
-#if DEBUG
- if (cachep->flags & SLAB_STORE_USER)
- set_store_user_dirty(cachep);
-#endif
-
return objp;
}
@@ -2691,13 +2661,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
if (!page)
return;
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
fixup_slab_list(cachep, n, page, &list);
@@ -2764,10 +2734,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER) {
- set_store_user_dirty(cachep);
+ if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- }
objnr = obj_to_index(cachep, page, objp);
@@ -2806,9 +2774,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (page->active == cachep->num) {
- list_add(&page->lru, &n->slabs_full);
+ list_add(&page->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
@@ -2822,7 +2790,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
page->freelist = NULL;
}
} else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&page->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
@@ -2845,20 +2813,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (!page->active) {
- list_add_tail(&page->lru, &n->slabs_free);
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, lru) {
+ list_for_each_entry(page, &n->slabs_partial, slab_list) {
if (!PageSlabPfmemalloc(page))
return page;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
if (!PageSlabPfmemalloc(page)) {
n->free_slabs--;
return page;
@@ -2873,11 +2841,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
struct page *page;
assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab_list);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free, struct page,
- lru);
+ slab_list);
if (page)
n->free_slabs--;
}
@@ -3378,29 +3347,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
objp = objpp[i];
page = virt_to_head_page(objp);
- list_del(&page->lru);
+ list_del(&page->slab_list);
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
if (page->active == 0) {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, lru);
- list_move(&page->lru, list);
+ page = list_last_entry(&n->slabs_free, struct page, slab_list);
+ list_move(&page->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3438,7 +3407,7 @@ free_done:
int i = 0;
struct page *page;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
BUG_ON(page->active);
i++;
@@ -4185,196 +4154,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return res;
}
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline int add_caller(unsigned long *n, unsigned long v)
-{
- unsigned long *p;
- int l;
- if (!v)
- return 1;
- l = n[1];
- p = n + 2;
- while (l) {
- int i = l/2;
- unsigned long *q = p + 2 * i;
- if (*q == v) {
- q[1]++;
- return 1;
- }
- if (*q > v) {
- l = i;
- } else {
- p = q + 2;
- l -= i + 1;
- }
- }
- if (++n[1] == n[0])
- return 0;
- memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
- p[0] = v;
- p[1] = 1;
- return 1;
-}
-
-static void handle_slab(unsigned long *n, struct kmem_cache *c,
- struct page *page)
-{
- void *p;
- int i, j;
- unsigned long v;
-
- if (n[0] == n[1])
- return;
- for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- bool active = true;
-
- for (j = page->active; j < c->num; j++) {
- if (get_free_obj(page, j) == i) {
- active = false;
- break;
- }
- }
-
- if (!active)
- continue;
-
- /*
- * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
- * mapping is established when actual object allocation and
- * we could mistakenly access the unmapped object in the cpu
- * cache.
- */
- if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
- continue;
-
- if (!add_caller(n, v))
- return;
- }
-}
-
-static void show_symbol(struct seq_file *m, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset, size;
- char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
-
- if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
- seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
- if (modname[0])
- seq_printf(m, " [%s]", modname);
- return;
- }
-#endif
- seq_printf(m, "%px", (void *)address);
-}
-
-static int leaks_show(struct seq_file *m, void *p)
-{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
- root_caches_node);
- struct page *page;
- struct kmem_cache_node *n;
- const char *name;
- unsigned long *x = m->private;
- int node;
- int i;
-
- if (!(cachep->flags & SLAB_STORE_USER))
- return 0;
- if (!(cachep->flags & SLAB_RED_ZONE))
- return 0;
-
- /*
- * Set store_user_clean and start to grab stored user information
- * for all objects on this cache. If some alloc/free requests comes
- * during the processing, information would be wrong so restart
- * whole processing.
- */
- do {
- set_store_user_clean(cachep);
- drain_cpu_caches(cachep);
-
- x[1] = 0;
-
- for_each_kmem_cache_node(cachep, node, n) {
-
- check_irq_on();
- spin_lock_irq(&n->list_lock);
-
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
- } while (!is_store_user_clean(cachep));
-
- name = cachep->name;
- if (x[0] == x[1]) {
- /* Increase the buffer size */
- mutex_unlock(&slab_mutex);
- m->private = kcalloc(x[0] * 4, sizeof(unsigned long),
- GFP_KERNEL);
- if (!m->private) {
- /* Too bad, we are really out */
- m->private = x;
- mutex_lock(&slab_mutex);
- return -ENOMEM;
- }
- *(unsigned long *)m->private = x[0] * 2;
- kfree(x);
- mutex_lock(&slab_mutex);
- /* Now make sure this entry will be retried */
- m->count = m->size;
- return 0;
- }
- for (i = 0; i < x[1]; i++) {
- seq_printf(m, "%s: %lu ", name, x[2*i+3]);
- show_symbol(m, x[2*i+2]);
- seq_putc(m, '\n');
- }
-
- return 0;
-}
-
-static const struct seq_operations slabstats_op = {
- .start = slab_start,
- .next = slab_next,
- .stop = slab_stop,
- .show = leaks_show,
-};
-
-static int slabstats_open(struct inode *inode, struct file *file)
-{
- unsigned long *n;
-
- n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
- if (!n)
- return -ENOMEM;
-
- *n = PAGE_SIZE / (2 * sizeof(unsigned long));
-
- return 0;
-}
-
-static const struct file_operations proc_slabstats_operations = {
- .open = slabstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-#endif
-
-static int __init slab_proc_init(void)
-{
-#ifdef CONFIG_DEBUG_SLAB_LEAK
- proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
-#endif
- return 0;
-}
-module_init(slab_proc_init);
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
diff --git a/mm/slob.c b/mm/slob.c
index 307c2c9feb44..84aefd9b91ee 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->lru, list);
+ list_add(&sp->slab_list, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->lru);
+ list_del(&sp->slab_list);
__ClearPageSlobFree(sp);
}
@@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order)
}
/*
- * Allocate a slob block within a given slob_page sp.
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @page.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise. If the
+ * allocation fills up @page then the page is removed from the
+ * freelist, in this case @page_removed_from_list will be set to
+ * true (set to false otherwise).
*/
-static void *slob_page_alloc(struct page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
+ *page_removed_from_list = false;
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
@@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
}
sp->units -= units;
- if (!sp->units)
+ if (!sp->units) {
clear_slob_page_free(sp);
+ *page_removed_from_list = true;
+ }
return cur;
}
if (slob_last(cur))
@@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
struct page *sp;
- struct list_head *prev;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
+ bool _unused;
if (size < SLOB_BREAK1)
slob_list = &free_slob_small;
@@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, lru) {
+ list_for_each_entry(sp, slob_list, slab_list) {
+ bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- /* Attempt to alloc */
- prev = sp->lru.prev;
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &page_removed_from_list);
if (!b)
continue;
- /* Improve fragment distribution and reduce our average
- * search time by starting our next search here. (see
- * Knuth vol 1, sec 2.5, pg 449) */
- if (prev != slob_list->prev &&
- slob_list->next != prev->next)
- list_move_tail(slob_list, prev->next);
+ /*
+ * If slob_page_alloc() removed sp from the list then we
+ * cannot call list functions on sp. If so allocation
+ * did not fragment the page anyway so optimisation is
+ * unnecessary.
+ */
+ if (!page_removed_from_list) {
+ /*
+ * Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449)
+ */
+ if (!list_is_first(&sp->slab_list, slob_list))
+ list_rotate_to_front(&sp->slab_list, slob_list);
+ }
break;
}
spin_unlock_irqrestore(&slob_lock, flags);
@@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->lru);
+ INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
diff --git a/mm/slub.c b/mm/slub.c
index 6b28cd2b5a58..cd04dbd2b5d0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -58,10 +58,11 @@
* D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
- * on any list. The processor that froze the slab is the one who can
- * perform list operations on the page. Other processors may put objects
- * onto the freelist but the processor that froze the slab is the only
- * one that can retrieve the objects from the page's freelist.
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -1014,7 +1015,7 @@ static void add_full(struct kmem_cache *s,
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->lru, &n->full);
+ list_add(&page->slab_list, &n->full);
}
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1023,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1764,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->lru, &n->partial);
+ list_add_tail(&page->slab_list, &n->partial);
else
- list_add(&page->lru, &n->partial);
+ list_add(&page->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -1780,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
n->nr_partial--;
}
@@ -1854,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
if (!pfmemalloc_match(page, flags))
@@ -1942,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
}
}
} while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
return NULL;
}
@@ -2240,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
@@ -2299,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
local_irq_restore(flags);
}
preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2398,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
struct page *page;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
@@ -2804,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif
+#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -2903,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, lru) {
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
- list_add(&page->lru, &discard);
+ list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
"Objects remaining in %s on __kmem_cache_shutdown()");
@@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, lru)
+ list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
int free = page->objects - page->inuse;
/* Do not reread page->inuse */
@@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s)
BUG_ON(free <= 0);
if (free == page->objects) {
- list_move(&page->lru, &discard);
+ list_move(&page->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->lru, promote + free - 1);
+ list_move(&page->slab_list, promote + free - 1);
}
/*
@@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, lru)
+ list_for_each_entry_safe(page, t, &discard, slab_list)
discard_slab(s, page);
if (slabs_node(s, node))
@@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
*/
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
-#endif
+#endif /* CONFIG_MEMCG */
static int slab_mem_going_offline_callback(void *arg)
{
@@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
for_each_kmem_cache_node(s, node, n) {
struct page *p;
- list_for_each_entry(p, &n->partial, lru)
+ list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- list_for_each_entry(p, &n->full, lru)
+ list_for_each_entry(p, &n->full, slab_list)
p->slab_cache = s;
#endif
}
@@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru) {
+ list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, lru) {
+ list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
- list_for_each_entry(page, &n->full, lru)
+ list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf, "No data\n");
return len;
}
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
@@ -4750,7 +4750,7 @@ static void __init resiliency_test(void)
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
#ifdef CONFIG_SYSFS
enum slab_stat_type {
@@ -5407,7 +5407,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
@@ -5608,7 +5608,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (buffer)
free_page((unsigned long)buffer);
-#endif
+#endif /* CONFIG_MEMCG */
}
static void kmem_cache_release(struct kobject *k)
diff --git a/mm/sparse.c b/mm/sparse.c
index 56e057c432f9..fd13166949b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-/*
- * returns the number of sections whose mem_maps were properly
- * set. If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+/**
+ * sparse_add_one_section - add a memory section
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Return:
+ * * 0 - On success.
+ * * -EEXIST - Section has been present.
+ * * -ENOMEM - Out of memory.
*/
int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
struct vmem_altmap *altmap)
diff --git a/mm/swap.c b/mm/swap.c
index 301ed4e04320..7ede3eddc12a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/swap.c
*
@@ -867,7 +868,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
SetPageLRU(page);
/*
* Page becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+ * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85245fdec8d9..eb714165afd2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
@@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cf63b5f01adf..596ac98051c5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/swapfile.c
*
diff --git a/mm/truncate.c b/mm/truncate.c
index b7d3c99f00c9..8563339041f6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d59b5a73dfb3..9932d5755e4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ retry:
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/util.c b/mm/util.c
index 43a2984bccaa..9834c4ab7d8e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -318,7 +319,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -339,10 +340,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+ int nr_pages, unsigned int gup_flags,
+ struct page **pages)
{
- return get_user_pages_unlocked(start, nr_pages, pages,
- write ? FOLL_WRITE : 0);
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -652,7 +653,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- long free, allowed, reserve;
+ long allowed;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -667,51 +668,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_zone_page_state(NR_FREE_PAGES);
- free += global_node_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_node_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Part of the kernel memory, which can be released
- * under memory pressure.
- */
- free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
+ if (pages > totalram_pages() + total_swap_pages)
goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
+ return 0;
}
allowed = vm_commit_limit();
@@ -725,7 +684,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
@@ -758,12 +718,12 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
len = arg_end - arg_start;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e5e9e1fcac01..7350a124524b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/vmalloc.c
*
@@ -32,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -324,6 +326,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
#define VM_LAZY_FREE 0x02
#define VM_VM_AREA 0x04
@@ -332,14 +337,67 @@ static DEFINE_SPINLOCK(vmap_area_lock);
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
-/* The vmap cache globals are protected by vmap_area_lock */
-static struct rb_node *free_vmap_cache;
-static unsigned long cached_hole_size;
-static unsigned long cached_vstart;
-static unsigned long cached_align;
+/*
+ * This augment red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
-static unsigned long vmap_area_pcpu_hole;
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
+RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size,
+ compute_subtree_max_size)
+
+static void purge_vmap_area_lazy(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static unsigned long lazy_max_pages(void);
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
@@ -360,41 +418,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
return NULL;
}
-static void __insert_vmap_area(struct vmap_area *va)
-{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+/*
+ * This function returns back addresses of parent node
+ * and its left or right link for further processing.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
+{
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
+
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
+ }
- while (*p) {
- struct vmap_area *tmp_va;
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with parent rb_node and correct direction, i name
+ * it link, where the new va->rb_node will be attached to.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
- else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
+ /*
+ * During the traversal we also do some sanity check.
+ * Trigger the BUG() if there are sides(left/right)
+ * or full overlaps.
+ */
+ if (va->va_start < tmp_va->va_end &&
+ va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_end > tmp_va->va_start &&
+ va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
else
BUG();
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
+}
+
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
+
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
}
- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (root == &free_vmap_area_root) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * It is because of we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let __augment_tree_propagate_from() puts everything to
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
- /* address-sort this list */
- tmp = rb_prev(&va->rb_node);
- if (tmp) {
- struct vmap_area *prev;
- prev = rb_entry(tmp, struct vmap_area, rb_node);
- list_add_rcu(&va->list, &prev->list);
- } else
- list_add_rcu(&va->list, &vmap_area_list);
+ /* Address-sort this list */
+ list_add(&va->list, head);
}
-static void purge_vmap_area_lazy(void);
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ /*
+ * During merging a VA node can be empty, therefore
+ * not linked with the tree nor list. Just check it.
+ */
+ if (!RB_EMPTY_NODE(&va->rb_node)) {
+ if (root == &free_vmap_area_root)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+ list_del(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+ }
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(struct rb_node *n)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long size;
+ bool found = false;
+
+ if (n == NULL)
+ return;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ size = va->subtree_max_size;
+ node = n;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) == size) {
+ node = node->rb_left;
+ } else {
+ if (va_size(va) == size) {
+ found = true;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+ }
+
+ if (!found) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+
+ augment_tree_propagate_check(n->rb_left);
+ augment_tree_propagate_check(n->rb_right);
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ struct rb_node *node = &va->rb_node;
+ unsigned long new_va_sub_max_size;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ new_va_sub_max_size = compute_subtree_max_size(va);
+
+ /*
+ * If the newly calculated maximum available size of the
+ * subtree is equal to the current one, then it means that
+ * the tree is propagated correctly. So we have to stop at
+ * this point to save cycles.
+ */
+ if (va->subtree_max_size == new_va_sub_max_size)
+ break;
+
+ va->subtree_max_size = new_va_sub_max_size;
+ node = rb_parent(&va->rb_node);
+ }
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check(free_vmap_area_root.rb_node);
+#endif
+}
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. If coalesce is not done a new
+ * free area is inserted. If VA has been merged, it is
+ * freed.
+ */
+static __always_inline void
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
+ */
+ link = find_va_links(va, root, NULL, &parent);
+
+ /*
+ * Get next node of VA to check if merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ sibling->va_end = va->va_end;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ return;
+ }
+ }
+
+insert:
+ if (!merged) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can be overflowed due to big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
+}
+
+/*
+ * Find the first free block(lowest start address) in the tree,
+ * that will accomplish the request corresponding to passing
+ * parameters.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = free_vmap_area_root.rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = size + align - 1;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
+ } else {
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * Does not make sense to go deeper towards the right
+ * sub-tree if it does not have a free block that is
+ * equal or bigger to the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the first right sub-tree,
+ * that will satisfy the search criteria. It can happen
+ * only once due to "vstart" restriction.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ node = node->rb_right;
+ break;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(unsigned long size)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
+
+static __always_inline int
+adjust_va_to_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size,
+ enum fit_type type)
+{
+ struct vmap_area *lva;
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va(va, &free_vmap_area_root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (unlikely(!lva))
+ return -1;
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -1;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (type == NE_FIT_TYPE)
+ insert_vmap_area_augment(lva, &va->rb_node,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns a start address of the newly allocated area, if success.
+ * Otherwise a vend is returned that indicates failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend, int node)
+{
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
+ enum fit_type type;
+ int ret;
+
+ va = find_vmap_lowest_match(size, align, vstart);
+ if (unlikely(!va))
+ return vend;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Classify what we have found. */
+ type = classify_va_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ if (ret)
+ return vend;
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ find_vmap_lowest_match_check(size);
+#endif
+
+ return nva_start_addr;
+}
/*
* Allocate a region of KVA of the specified size and alignment, within the
@@ -406,18 +1033,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
- struct rb_node *n;
unsigned long addr;
int purged = 0;
- struct vmap_area *first;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
might_sleep();
- va = kmalloc_node(sizeof(struct vmap_area),
+ va = kmem_cache_alloc_node(vmap_area_cachep,
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -430,87 +1058,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
retry:
spin_lock(&vmap_area_lock);
- /*
- * Invalidate cache if we have more permissive parameters.
- * cached_hole_size notes the largest hole noticed _below_
- * the vmap_area cached in free_vmap_cache: if size fits
- * into that hole, we want to scan from vstart to reuse
- * the hole instead of allocating above free_vmap_cache.
- * Note that __free_vmap_area may update free_vmap_cache
- * without updating cached_hole_size or cached_align.
- */
- if (!free_vmap_cache ||
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
-nocache:
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
- /* record if we encounter less permissive parameters */
- cached_vstart = vstart;
- cached_align = align;
-
- /* find starting point for our search */
- if (free_vmap_cache) {
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size < addr)
- goto overflow;
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size < addr)
- goto overflow;
-
- n = vmap_area_root.rb_node;
- first = NULL;
-
- while (n) {
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
- }
-
- if (!first)
- goto found;
- }
-
- /* from the starting point, walk areas until a suitable hole is found */
- while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
- goto overflow;
-
- if (list_is_last(&first->list, &vmap_area_list))
- goto found;
-
- first = list_next_entry(first, list);
- }
-
-found:
/*
- * Check also calculated address against the vstart,
- * because it can be 0 because of big align request.
+ * If an allocation fails, the "vend" address is
+ * returned. Therefore trigger the overflow path.
*/
- if (addr + size > vend || addr < vstart)
+ addr = __alloc_vmap_area(size, align, vstart, vend, node);
+ if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -539,7 +1100,8 @@ overflow:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
- kfree(va);
+
+ kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
@@ -559,35 +1121,16 @@ static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
- if (free_vmap_cache) {
- if (va->va_end < cached_vstart) {
- free_vmap_cache = NULL;
- } else {
- struct vmap_area *cache;
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- if (va->va_start <= cache->va_start) {
- free_vmap_cache = rb_prev(&va->rb_node);
- /*
- * We don't try to update cached_hole_size or
- * cached_align, but it won't go very wrong.
- */
- }
- }
- }
- rb_erase(&va->rb_node, &vmap_area_root);
- RB_CLEAR_NODE(&va->rb_node);
- list_del_rcu(&va->list);
-
/*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
+ * Remove from the busy tree/list.
*/
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+ unlink_va(va, &vmap_area_root);
- kfree_rcu(va, rcu_head);
+ /*
+ * Merge VA with its neighbors, otherwise just add it.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
}
/*
@@ -633,7 +1176,7 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual criticial section protected
@@ -651,7 +1194,7 @@ static void purge_fragmented_blocks_allcpus(void);
*/
void set_iounmap_nonlazy(void)
{
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
/*
@@ -659,34 +1202,40 @@ void set_iounmap_nonlazy(void)
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
+ unsigned long resched_threshold;
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- bool do_free = false;
lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
+ if (unlikely(valist == NULL))
+ return false;
+
+ /*
+ * TODO: to calculate a flush range without looping.
+ * The list can be up to lazy_max_pages() elements.
+ */
llist_for_each_entry(va, valist, purge_list) {
if (va->va_start < start)
start = va->va_start;
if (va->va_end > end)
end = va->va_end;
- do_free = true;
}
- if (!do_free)
- return false;
-
flush_tlb_kernel_range(start, end);
+ resched_threshold = lazy_max_pages() << 1;
spin_lock(&vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
__free_vmap_area(va);
- atomic_sub(nr, &vmap_lazy_nr);
- cond_resched_lock(&vmap_area_lock);
+ atomic_long_sub(nr, &vmap_lazy_nr);
+
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+ cond_resched_lock(&vmap_area_lock);
}
spin_unlock(&vmap_area_lock);
return true;
@@ -722,10 +1271,10 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- int nr_lazy;
+ unsigned long nr_lazy;
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
- &vmap_lazy_nr);
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+ PAGE_SHIFT, &vmap_lazy_nr);
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
@@ -788,8 +1337,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
-static bool vmap_initialized __read_mostly = false;
-
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
@@ -1250,12 +1797,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
vm_area_add_early(vm);
}
+static void vmap_init_free_space(void)
+{
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *busy, *free;
+
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ list_for_each_entry(busy, &vmap_area_list, list) {
+ if (busy->va_start - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = busy->va_start;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = busy->va_end;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+}
+
void __init vmalloc_init(void)
{
struct vmap_area *va;
struct vm_struct *tmp;
int i;
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;
@@ -1270,16 +1863,21 @@ void __init vmalloc_init(void)
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
- __insert_vmap_area(va);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- vmap_area_pcpu_hole = VMALLOC_END;
-
+ /*
+ * Now we can initialize a free vmap space.
+ */
+ vmap_init_free_space();
vmap_initialized = true;
}
@@ -2471,81 +3069,64 @@ static struct vmap_area *node_to_va(struct rb_node *n)
}
/**
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
- * @end: target address
- * @pnext: out arg for the next vmap_area
- * @pprev: out arg for the previous vmap_area
- *
- * Returns: %true if either or both of next and prev are found,
- * %false if no vmap_area exists
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
*
- * Find vmap_areas end addresses of which enclose @end. ie. if not
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ * Returns: vmap_area if it is found. If there is no such area
+ * the first highest(reverse order) vmap_area is returned
+ * i.e. va->va_start < addr && va->va_end < addr or NULL
+ * if there are no any areas before @addr.
*/
-static bool pvm_find_next_prev(unsigned long end,
- struct vmap_area **pnext,
- struct vmap_area **pprev)
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
{
- struct rb_node *n = vmap_area_root.rb_node;
- struct vmap_area *va = NULL;
+ struct vmap_area *va, *tmp;
+ struct rb_node *n;
+
+ n = free_vmap_area_root.rb_node;
+ va = NULL;
while (n) {
- va = rb_entry(n, struct vmap_area, rb_node);
- if (end < va->va_end)
- n = n->rb_left;
- else if (end > va->va_end)
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_start <= addr) {
+ va = tmp;
+ if (tmp->va_end >= addr)
+ break;
+
n = n->rb_right;
- else
- break;
+ } else {
+ n = n->rb_left;
+ }
}
- if (!va)
- return false;
-
- if (va->va_end > end) {
- *pnext = va;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
- } else {
- *pprev = va;
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
- }
- return true;
+ return va;
}
/**
- * pvm_determine_end - find the highest aligned address between two vmap_areas
- * @pnext: in/out arg for the next vmap_area
- * @pprev: in/out arg for the previous vmap_area
- * @align: alignment
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search(reverse order);
+ * out - the VA with the highest aligned end address.
*
- * Returns: determined end address
- *
- * Find the highest aligned address between *@pnext and *@pprev below
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
- * down address is between the end addresses of the two vmap_areas.
- *
- * Please note that the address returned by this function may fall
- * inside *@pnext vmap_area. The caller is responsible for checking
- * that.
+ * Returns: determined end address within vmap_area
*/
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
- struct vmap_area **pprev,
- unsigned long align)
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
- if (*pnext)
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
- else
- addr = vmalloc_end;
-
- while (*pprev && (*pprev)->va_end > addr) {
- *pnext = *pprev;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ if (likely(*va)) {
+ list_for_each_entry_from_reverse((*va),
+ &free_vmap_area_list, list) {
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+ if ((*va)->va_start < addr)
+ return addr;
+ }
}
- return addr;
+ return 0;
}
/**
@@ -2565,12 +3146,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* to gigabytes. To avoid interacting with regular vmallocs, these
* areas are allocated from top.
*
- * Despite its complicated look, this allocator is rather simple. It
- * does everything top-down and scans areas from the end looking for
- * matching slot. While scanning, if any of the areas overlaps with
- * existing vmap_area, the base address is pulled down to fit the
- * area. Scanning is repeated till all the areas fit and then all
- * necessary data structures are inserted and the result is returned.
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for matching base. While scanning, if any of the areas do not fit the
+ * base address is pulled down to fit the area. Scanning is repeated till
+ * all the areas fit and then all necessary data structures are inserted
+ * and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2578,11 +3159,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
- struct vmap_area **vas, *prev, *next;
+ struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, end, last_end;
+ unsigned long base, start, size, end, last_end;
bool purged = false;
+ enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2618,7 +3200,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
goto err_free2;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
@@ -2631,49 +3213,29 @@ retry:
start = offsets[area];
end = start + sizes[area];
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
- base = vmalloc_end - last_end;
- goto found;
- }
- base = pvm_determine_end(&next, &prev, align) - end;
+ va = pvm_find_va_enclose_addr(vmalloc_end);
+ base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
- BUG_ON(next && next->va_end <= base + end);
- BUG_ON(prev && prev->va_end > base + end);
-
/*
* base might have underflowed, add last_end before
* comparing.
*/
- if (base + last_end < vmalloc_start + last_end) {
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = true;
- goto retry;
- }
- goto err_free;
- }
+ if (base + last_end < vmalloc_start + last_end)
+ goto overflow;
/*
- * If next overlaps, move base downwards so that it's
- * right below next and then recheck.
+ * Fitting base has not been found.
*/
- if (next && next->va_start < base + end) {
- base = pvm_determine_end(&next, &prev, align) - end;
- term_area = area;
- continue;
- }
+ if (va == NULL)
+ goto overflow;
/*
- * If prev overlaps, shift down next and prev and move
- * base so that it's right below new next and then
- * recheck.
+ * If this VA does not fit, move base downwards and recheck.
*/
- if (prev && prev->va_end > base + start) {
- next = prev;
- prev = node_to_va(rb_prev(&next->rb_node));
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (base + start < va->va_start || base + end > va->va_end) {
+ va = node_to_va(rb_prev(&va->rb_node));
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
@@ -2685,21 +3247,40 @@ retry:
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
+
start = offsets[area];
end = start + sizes[area];
- pvm_find_next_prev(base + end, &next, &prev);
+ va = pvm_find_va_enclose_addr(base + end);
}
-found:
+
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
- struct vmap_area *va = vas[area];
+ int ret;
- va->va_start = base + offsets[area];
- va->va_end = va->va_start + sizes[area];
- __insert_vmap_area(va);
- }
+ start = base + offsets[area];
+ size = sizes[area];
+
+ va = pvm_find_va_enclose_addr(start);
+ if (WARN_ON_ONCE(va == NULL))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ type = classify_va_fit_type(va, start, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ ret = adjust_va_to_fit_type(va, start, size, type);
+ if (unlikely(ret))
+ goto recovery;
- vmap_area_pcpu_hole = base + offsets[last_area];
+ /* Allocated area. */
+ va = vas[area];
+ va->va_start = start;
+ va->va_end = start + size;
+
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ }
spin_unlock(&vmap_area_lock);
@@ -2711,9 +3292,38 @@ found:
kfree(vas);
return vms;
+recovery:
+ /* Remove previously inserted areas. */
+ while (area--) {
+ __free_vmap_area(vas[area]);
+ vas[area] = NULL;
+ }
+
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+
+ /* Before "retry", check if we recover. */
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ continue;
+
+ vas[area] = kmem_cache_zalloc(
+ vmap_area_cachep, GFP_KERNEL);
+ if (!vas[area])
+ goto err_free;
+ }
+
+ goto retry;
+ }
+
err_free:
for (area = 0; area < nr_vms; area++) {
- kfree(vas[area]);
+ if (vas[area])
+ kmem_cache_free(vmap_area_cachep, vas[area]);
+
kfree(vms[area]);
}
err_free2:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a815f73ee4d5..7acd0afdfc2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
int zid;
if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
@@ -493,7 +493,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan += delta;
if (total_scan < 0) {
- pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+ pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
next_deferred = nr;
@@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
unsigned nr_reclaimed = 0;
+ unsigned pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1466,8 +1467,10 @@ activate_locked:
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
+ int type = page_is_file_cache(page);
SetPageActive(page);
- stat->nr_activate++;
+ pgactivate++;
+ stat->nr_activate[type] += hpage_nr_pages(page);
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1482,7 +1485,7 @@ keep:
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
- count_vm_events(PGACTIVATE, stat->nr_activate);
+ count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
return isolated > inactive;
}
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone_lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone_lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- int lru;
-
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
continue;
}
-
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
- }
+ nr_pages = hpage_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, page_list);
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
}
/*
@@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_taken;
struct reclaim_stat stat;
int file = is_file_lru(lru);
+ enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
@@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
- nr_scanned);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_DIRECT, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
- nr_scanned);
- }
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
- nr_reclaimed);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
- nr_reclaimed);
- }
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ reclaim_stat->recent_rotated[0] = stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] = stat.nr_activate[1];
- putback_inactive_pages(lruvec, &page_list);
+ move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return nr_reclaimed;
}
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold pgdat->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop pgdat->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list,
- struct list_head *pages_to_free,
- enum lru_list lru)
-{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct page *page;
- int nr_pages;
- int nr_moved = 0;
-
- while (!list_empty(list)) {
- page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
-
- nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
-
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
- (*get_compound_page_dtor(page))(page);
- spin_lock_irq(&pgdat->lru_lock);
- } else
- list_add(&page->lru, pages_to_free);
- } else {
- nr_moved += nr_pages;
- }
- }
-
- if (!is_active_lru(lru)) {
- __count_vm_events(PGDEACTIVATE, nr_moved);
- count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_moved);
- }
-
- return nr_moved;
-}
-
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge_list(&l_hold);
- free_unref_page_list(&l_hold);
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2201,7 +2150,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
if (file && actual_reclaim && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
@@ -2963,7 +2912,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
- refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
@@ -3212,10 +3161,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
- trace_mm_vmscan_direct_reclaim_begin(order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3246,9 +3192,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3297,10 +3241,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4149,6 +4090,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.reclaim_idx = gfp_zone(gfp_mask),
};
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
@@ -4175,6 +4119,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7d493366a65..fd7e16ca6996 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/vmstat.c
*
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67502d5..e0b4edcb88c8 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -426,12 +426,14 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
+ int i;
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL);
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
- pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state_local(lruvec,
+ NR_LRU_BASE + i);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
} else
#endif
pages = node_present_pages(sc->nid);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index aee9b0b8d907..985732c8b025 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* z3fold.c
*
@@ -24,16 +25,47 @@
#include <linux/atomic.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
/*****************
* Structures
*****************/
@@ -47,8 +79,18 @@ enum buddy {
FIRST,
MIDDLE,
LAST,
- BUDDIES_MAX
+ BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+ /*
+ * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+ * be enough slots to hold all possible variants
+ */
+ unsigned long slot[BUDDY_MASK + 1];
+ unsigned long pool; /* back link + flags */
};
+#define HANDLE_FLAG_MASK (0x03)
/*
* struct z3fold_header - z3fold page metadata occupying first chunks of each
@@ -58,49 +100,29 @@ enum buddy {
* @page_lock: per-page lock
* @refcount: reference count for the z3fold page
* @work: work_struct for page layout optimization
- * @pool: pointer to the pool which this page belongs to
+ * @slots: pointer to the structure holding buddy slots
* @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
* @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
*/
struct z3fold_header {
struct list_head buddy;
spinlock_t page_lock;
struct kref refcount;
struct work_struct work;
- struct z3fold_pool *pool;
+ struct z3fold_buddy_slots *slots;
short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
unsigned short last_chunks;
unsigned short start_middle;
unsigned short first_num:2;
+ unsigned short mapped_count:2;
};
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
@@ -113,11 +135,13 @@ struct z3fold_header {
* added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -130,12 +154,14 @@ struct z3fold_pool {
struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
+ struct kmem_cache *c_handle;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
+ struct inode *inode;
};
/*
@@ -164,11 +190,119 @@ static int size_to_chunks(size_t size)
static void compact_page_work(struct work_struct *w);
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
+ gfp_t gfp)
+{
+ struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
+ gfp);
+
+ if (slots) {
+ memset(slots->slot, 0, sizeof(slots->slot));
+ slots->pool = (unsigned long)pool;
+ }
+
+ return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+ return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+ return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+static inline void free_handle(unsigned long handle)
+{
+ struct z3fold_buddy_slots *slots;
+ int i;
+ bool is_free;
+
+ if (handle & (1 << PAGE_HEADLESS))
+ return;
+
+ WARN_ON(*(unsigned long *)handle == 0);
+ *(unsigned long *)handle = 0;
+ slots = handle_to_slots(handle);
+ is_free = true;
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+
+ if (is_free) {
+ struct z3fold_pool *pool = slots_to_pool(slots);
+
+ kmem_cache_free(pool->c_handle, slots);
+ }
+}
+
+static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+}
+
+static struct file_system_type z3fold_fs = {
+ .name = "z3fold",
+ .mount = z3fold_do_mount,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+ int ret = 0;
+
+ z3fold_mnt = kern_mount(&z3fold_fs);
+ if (IS_ERR(z3fold_mnt))
+ ret = PTR_ERR(z3fold_mnt);
+
+ return ret;
+}
+
+static void z3fold_unmount(void)
+{
+ kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+ pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &z3fold_aops;
+ return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+ if (pool->inode)
+ iput(pool->inode);
+ }
+
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page,
- struct z3fold_pool *pool)
+ struct z3fold_pool *pool, gfp_t gfp)
{
struct z3fold_header *zhdr = page_address(page);
+ struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
+
+ if (!slots)
+ return NULL;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
@@ -185,15 +319,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->pool = pool;
+ zhdr->slots = slots;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
}
/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static void free_z3fold_page(struct page *page, bool headless)
{
+ if (!headless) {
+ lock_page(page);
+ __ClearPageMovable(page);
+ unlock_page(page);
+ }
+ ClearPagePrivate(page);
__free_page(page);
}
@@ -215,33 +355,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
/*
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
- unsigned long handle;
+ struct z3fold_buddy_slots *slots;
+ unsigned long h = (unsigned long)zhdr;
+ int idx = 0;
- handle = (unsigned long)zhdr;
- if (bud != HEADLESS) {
- handle |= (bud + zhdr->first_num) & BUDDY_MASK;
- if (bud == LAST)
- handle |= (zhdr->last_chunks << BUDDY_SHIFT);
- }
- return handle;
+ /*
+ * For a headless page, its handle is its pointer with the extra
+ * PAGE_HEADLESS bit set
+ */
+ if (bud == HEADLESS)
+ return h | (1 << PAGE_HEADLESS);
+
+ /* otherwise, return pointer to encoded handle */
+ idx = __idx(zhdr, bud);
+ h += idx;
+ if (bud == LAST)
+ h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+ slots = zhdr->slots;
+ slots->slot[idx] = h;
+ return (unsigned long)&slots->slot[idx];
}
/* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
- return (struct z3fold_header *)(handle & PAGE_MASK);
+ unsigned long addr = h;
+
+ if (!(addr & (1 << PAGE_HEADLESS)))
+ addr = *(unsigned long *)h;
+
+ return (struct z3fold_header *)(addr & PAGE_MASK);
}
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
- return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+ unsigned long addr = *(unsigned long *)handle;
+
+ return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
/*
@@ -251,21 +415,31 @@ static unsigned short handle_to_chunks(unsigned long handle)
*/
static enum buddy handle_to_buddy(unsigned long handle)
{
- struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
- return (handle - zhdr->first_num) & BUDDY_MASK;
+ struct z3fold_header *zhdr;
+ unsigned long addr;
+
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+ return slots_to_pool(zhdr->slots);
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
- list_del(&page->lru);
+ list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
z3fold_page_unlock(zhdr);
@@ -295,9 +469,10 @@ static void release_z3fold_page_locked_list(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
- spin_lock(&zhdr->pool->lock);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
- spin_unlock(&zhdr->pool->lock);
+ spin_unlock(&pool->lock);
WARN_ON(z3fold_page_trylock(zhdr));
__release_z3fold_page(zhdr, true);
@@ -318,7 +493,7 @@ static void free_pages_work(struct work_struct *w)
continue;
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
- free_z3fold_page(page);
+ free_z3fold_page(page, false);
cond_resched();
spin_lock(&pool->stale_lock);
}
@@ -349,6 +524,23 @@ static int num_free_chunks(struct z3fold_header *zhdr)
return nfree;
}
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+ struct z3fold_header *zhdr)
+{
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+ int freechunks = num_free_chunks(zhdr);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
+ }
+}
+
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -367,6 +559,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
return 0; /* can't move middle chunk, it's used */
+ if (unlikely(PageIsolated(page)))
+ return 0;
+
if (zhdr->middle_chunks == 0)
return 0; /* nothing to compact */
@@ -406,10 +601,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
struct page *page;
- struct list_head *unbuddied;
- int fchunks;
page = virt_to_page(zhdr);
if (locked)
@@ -429,19 +622,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
- z3fold_compact_page(zhdr);
- unbuddied = get_cpu_ptr(pool->unbuddied);
- fchunks = num_free_chunks(zhdr);
- if (fchunks < NCHUNKS &&
- (!zhdr->first_chunks || !zhdr->middle_chunks ||
- !zhdr->last_chunks)) {
- /* the page's not completely free and it's unbuddied */
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[fchunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
+ if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_STALE, &page->private))) {
+ z3fold_page_unlock(zhdr);
+ return;
}
- put_cpu_ptr(pool->unbuddied);
+
+ z3fold_compact_page(zhdr);
+ add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -453,6 +641,103 @@ static void compact_page_work(struct work_struct *w)
do_compact_page(zhdr, false);
}
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+ size_t size, bool can_sleep)
+{
+ struct z3fold_header *zhdr = NULL;
+ struct page *page;
+ struct list_head *unbuddied;
+ int chunks = size_to_chunks(size), i;
+
+lookup:
+ /* First, try to find an unbuddied z3fold page. */
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ for_each_unbuddied_list(i, chunks) {
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr)
+ continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (!zhdr) {
+ int cpu;
+
+ /* look for _exact_ match on other cpus' lists */
+ for_each_online_cpu(cpu) {
+ struct list_head *l;
+
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+ spin_lock(&pool->lock);
+ l = &unbuddied[chunks];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ if (can_sleep)
+ cond_resched();
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ }
+
+ return zhdr;
+}
/*
* API Functions
@@ -476,6 +761,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool = kzalloc(sizeof(struct z3fold_pool), gfp);
if (!pool)
goto out;
+ pool->c_handle = kmem_cache_create("z3fold_handle",
+ sizeof(struct z3fold_buddy_slots),
+ SLOTS_ALIGN, 0, NULL);
+ if (!pool->c_handle)
+ goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
@@ -497,15 +787,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
+ if (z3fold_register_migration(pool))
+ goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
pool->ops = ops;
return pool;
+out_rwq:
+ destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
free_percpu(pool->unbuddied);
out_pool:
+ kmem_cache_destroy(pool->c_handle);
+out_c:
kfree(pool);
out:
return NULL;
@@ -519,6 +815,8 @@ out:
*/
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
+ kmem_cache_destroy(pool->c_handle);
+ z3fold_unregister_migration(pool);
destroy_workqueue(pool->release_wq);
destroy_workqueue(pool->compact_wq);
kfree(pool);
@@ -546,7 +844,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- int chunks = 0, i, freechunks;
+ int chunks = size_to_chunks(size);
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
enum buddy bud;
@@ -561,56 +859,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
- struct list_head *unbuddied;
- chunks = size_to_chunks(size);
-
-lookup:
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- l = &unbuddied[i];
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- put_cpu_ptr(pool->unbuddied);
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- put_cpu_ptr(pool->unbuddied);
-
+retry:
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
@@ -630,8 +880,9 @@ lookup:
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- goto lookup;
+ goto retry;
}
+ page = virt_to_page(zhdr);
goto found;
}
bud = FIRST;
@@ -662,13 +913,18 @@ lookup:
if (!page)
return -ENOMEM;
+ zhdr = init_z3fold_page(page, pool, gfp);
+ if (!zhdr) {
+ __free_page(page);
+ return -ENOMEM;
+ }
atomic64_inc(&pool->pages_nr);
- zhdr = init_z3fold_page(page, pool);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
+ __SetPageMovable(page, pool->inode->i_mapping);
z3fold_page_lock(zhdr);
found:
@@ -680,19 +936,7 @@ found:
zhdr->middle_chunks = chunks;
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
- }
+ add_to_unbuddied(pool, zhdr);
headless:
spin_lock(&pool->lock);
@@ -739,7 +983,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
return;
@@ -766,6 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
+ free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -774,7 +1019,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (unlikely(PageIsolated(page)) ||
+ test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -855,10 +1101,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (test_and_set_bit(PAGE_CLAIMED, &page->private))
continue;
- zhdr = page_address(page);
+ if (unlikely(PageIsolated(page)))
+ continue;
if (test_bit(PAGE_HEADLESS, &page->private))
break;
+ zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr)) {
zhdr = NULL;
continue; /* can't evict at this point */
@@ -919,7 +1167,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
return 0;
}
@@ -996,6 +1244,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
break;
}
+ if (addr)
+ zhdr->mapped_count++;
z3fold_page_unlock(zhdr);
out:
return addr;
@@ -1022,6 +1272,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ zhdr->mapped_count--;
z3fold_page_unlock(zhdr);
}
@@ -1036,6 +1287,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
return atomic64_read(&pool->pages_nr);
}
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ return false;
+
+ zhdr = page_address(page);
+ z3fold_page_lock(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))
+ goto out;
+
+ pool = zhdr_to_pool(zhdr);
+
+ if (zhdr->mapped_count == 0) {
+ kref_get(&zhdr->refcount);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ return true;
+ }
+out:
+ z3fold_page_unlock(zhdr);
+ return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct z3fold_header *zhdr, *new_zhdr;
+ struct z3fold_pool *pool;
+ struct address_space *new_mapping;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ if (!trylock_page(page))
+ return -EAGAIN;
+
+ if (!z3fold_page_trylock(zhdr)) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ if (zhdr->mapped_count != 0) {
+ z3fold_page_unlock(zhdr);
+ unlock_page(page);
+ return -EBUSY;
+ }
+ new_zhdr = page_address(newpage);
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
+ newpage->private = page->private;
+ page->private = 0;
+ z3fold_page_unlock(zhdr);
+ spin_lock_init(&new_zhdr->page_lock);
+ new_mapping = page_mapping(page);
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+
+ get_page(newpage);
+ z3fold_page_lock(new_zhdr);
+ if (new_zhdr->first_chunks)
+ encode_handle(new_zhdr, FIRST);
+ if (new_zhdr->last_chunks)
+ encode_handle(new_zhdr, LAST);
+ if (new_zhdr->middle_chunks)
+ encode_handle(new_zhdr, MIDDLE);
+ set_bit(NEEDS_COMPACTING, &newpage->private);
+ new_zhdr->cpu = smp_processor_id();
+ spin_lock(&pool->lock);
+ list_add(&newpage->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ __SetPageMovable(newpage, new_mapping);
+ z3fold_page_unlock(new_zhdr);
+
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+ page_mapcount_reset(page);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ z3fold_page_lock(zhdr);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ INIT_LIST_HEAD(&page->lru);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+ .isolate_page = z3fold_page_isolate,
+ .migratepage = z3fold_page_migrate,
+ .putback_page = z3fold_page_putback,
+};
+
/*****************
* zpool
****************/
@@ -1133,8 +1506,14 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
+ int ret;
+
/* Make sure the z3fold header is not larger than the page size */
BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ ret = z3fold_mount();
+ if (ret)
+ return ret;
+
zpool_register_driver(&z3fold_zpool_driver);
return 0;
@@ -1142,6 +1521,7 @@ static int __init init_z3fold(void)
static void __exit exit_z3fold(void)
{
+ z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index 28458f7d1e84..de5dd4ddaa82 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* zbud.c
*
diff --git a/mm/zpool.c b/mm/zpool.c
index 01a771e304fa..a2dd9107857d 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* zpool memory storage api
*
diff --git a/mm/zswap.c b/mm/zswap.c
index a4e4d36ec085..2412042f5550 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* zswap.c - zswap driver file
*
@@ -8,16 +9,6 @@
* than reading from the swap device, can also improve workload performance.
*
* Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt