summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorScott Wood <scottwood@freescale.com>2015-02-13 22:12:06 (GMT)
committerScott Wood <scottwood@freescale.com>2015-02-13 22:19:22 (GMT)
commit6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch)
treef558a94f1553814cc122ab8d9e04c0ebad5262a5 /mm
parentfcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff)
downloadlinux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz
Reset to 3.12.37
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bounce.c4
-rw-r--r--mm/compaction.c376
-rw-r--r--mm/filemap.c470
-rw-r--r--mm/fremap.c28
-rw-r--r--mm/frontswap.c17
-rw-r--r--mm/highmem.c6
-rw-r--r--mm/huge_memory.c139
-rw-r--r--mm/hugetlb.c89
-rw-r--r--mm/internal.h23
-rw-r--r--mm/kmemleak.c4
-rw-r--r--mm/ksm.c1
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c60
-rw-r--r--mm/memory-failure.c94
-rw-r--r--mm/memory.c97
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c80
-rw-r--r--mm/migrate.c63
-rw-r--r--mm/mincore.c20
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c72
-rw-r--r--mm/mmu_context.c2
-rw-r--r--mm/mremap.c9
-rw-r--r--mm/nommu.c24
-rw-r--r--mm/oom_kill.c80
-rw-r--r--mm/page-writeback.c23
-rw-r--r--mm/page_alloc.c712
-rw-r--r--mm/page_cgroup.c12
-rw-r--r--mm/percpu-vm.c22
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/readahead.c37
-rw-r--r--mm/rmap.c43
-rw-r--r--mm/shmem.c241
-rw-r--r--mm/slab.c23
-rw-r--r--mm/slab.h4
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/slub.c142
-rw-r--r--mm/swap.c135
-rw-r--r--mm/swap_state.c65
-rw-r--r--mm/swapfile.c224
-rw-r--r--mm/truncate.c130
-rw-r--r--mm/util.c9
-rw-r--r--mm/vmacache.c114
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmpressure.c8
-rw-r--r--mm/vmscan.c225
-rw-r--r--mm/vmstat.c19
49 files changed, 2448 insertions, 1548 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 083685a..2a092f5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -254,6 +254,9 @@ config MIGRATION
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ boolean
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -384,7 +387,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
help
Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 305d10a..fb51bc6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o \
+ compaction.o balloon_compaction.o vmacache.o \
interval_tree.o list_lru.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/bounce.c b/mm/bounce.c
index b09bb4e..5a7d58f 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -51,11 +51,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
unsigned long flags;
unsigned char *vto;
- local_irq_save_nort(flags);
+ local_irq_save(flags);
vto = kmap_atomic(to->bv_page);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto);
- local_irq_restore_nort(flags);
+ local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */
diff --git a/mm/compaction.c b/mm/compaction.c
index d2c6751..adb6d05 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
- zone->compact_cached_migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn[0] = start_pfn;
+ zone->compact_cached_migrate_pfn[1] = start_pfn;
zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
*/
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ bool set_unsuitable, bool migrate_scanner)
{
struct zone *zone = cc->zone;
+ unsigned long pfn;
if (cc->ignore_skip_hint)
return;
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
if (!page)
return;
- if (!nr_isolated) {
- unsigned long pfn = page_to_pfn(page);
+ if (nr_isolated)
+ return;
+
+ /*
+ * Only skip pageblocks when all forms of compaction will be known to
+ * fail in the near future.
+ */
+ if (set_unsuitable)
set_pageblock_skip(page);
- /* Update where compaction should restart */
- if (migrate_scanner) {
- if (!cc->finished_update_migrate &&
- pfn > zone->compact_cached_migrate_pfn)
- zone->compact_cached_migrate_pfn = pfn;
- } else {
- if (!cc->finished_update_free &&
- pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ pfn = page_to_pfn(page);
+
+ /* Update where async and sync compaction should restart */
+ if (migrate_scanner) {
+ if (cc->finished_update_migrate)
+ return;
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+ } else {
+ if (cc->finished_update_free)
+ return;
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
}
#else
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ bool set_unsuitable, bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
}
/* async aborts if taking too long or contended */
- if (!cc->sync) {
+ if (cc->mode == MIGRATE_ASYNC) {
cc->contended = true;
return false;
}
@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
-static inline bool compact_trylock_irqsave(spinlock_t *lock,
- unsigned long *flags, struct compact_control *cc)
+/*
+ * Aside from avoiding lock contention, compaction also periodically checks
+ * need_resched() and either schedules in sync compaction or aborts async
+ * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * is used where no lock is concerned.
+ *
+ * Returns false when no scheduling was needed, or sync compaction scheduled.
+ * Returns true when async compaction should abort.
+ */
+static inline bool compact_should_abort(struct compact_control *cc)
{
- return compact_checklock_irqsave(lock, flags, false, cc);
+ /* async compaction aborts if contended */
+ if (need_resched()) {
+ if (cc->mode == MIGRATE_ASYNC) {
+ cc->contended = true;
+ return true;
+ }
+
+ cond_resched();
+ }
+
+ return false;
}
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_RESERVE)
- return false;
-
- if (is_migrate_isolate(migratetype))
- return false;
-
- /* If the page is a large free page, then allow migration */
+ /* If the page is a large free page, then disallow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
+ return false;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
struct page *cursor, *valid_page = NULL;
unsigned long flags;
bool locked = false;
+ bool checked_pageblock = false;
cursor = pfn_to_page(blockpfn);
@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
/* Recheck this is a suitable migration target under lock */
- if (!strict && !suitable_migration_target(page))
- break;
+ if (!strict && !checked_pageblock) {
+ /*
+ * We need to check suitability of pageblock only once
+ * and this isolate_freepages_block() is called with
+ * pageblock range, so just check once is sufficient.
+ */
+ checked_pageblock = true;
+ if (!suitable_migration_target(page))
+ break;
+ }
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -330,7 +362,8 @@ isolate_fail:
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
+ update_pageblock_skip(cc, valid_page, total_isolated, true,
+ false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ bool set_unsuitable = true;
+ const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+ ISOLATE_ASYNC_MIGRATE : 0) |
+ (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (!cc->sync)
+ if (cc->mode == MIGRATE_ASYNC)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
return 0;
}
+ if (compact_should_abort(cc))
+ return 0;
+
/* Time to isolate some pages for migration */
- cond_resched();
for (; low_pfn < end_pfn; low_pfn++) {
/* give a chance to irqs before checking need_resched() */
- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
if (should_release_lock(&zone->lru_lock)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
locked = false;
@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* If isolation recently failed, do not retry */
pageblock_nr = low_pfn >> pageblock_order;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ if (last_pageblock_nr != pageblock_nr) {
+ int mt;
+
+ last_pageblock_nr = pageblock_nr;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /*
+ * For async migration, also only scan in MOVABLE
+ * blocks. Async migration is optimistic to see if
+ * the minimum amount of work satisfies the allocation
+ */
+ mt = get_pageblock_migratetype(page);
+ if (cc->mode == MIGRATE_ASYNC &&
+ !migrate_async_suitable(mt)) {
+ set_unsuitable = false;
+ goto next_pageblock;
+ }
+ }
/* Skip if free */
if (PageBuddy(page))
continue;
/*
- * For async migration, also only scan in MOVABLE blocks. Async
- * migration is optimistic to see if the minimum amount of work
- * satisfies the allocation
- */
- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
- !migrate_async_suitable(get_pageblock_migratetype(page))) {
- cc->finished_update_migrate = true;
- goto next_pageblock;
- }
-
- /*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
- cc->finished_update_migrate = true;
- list_add(&page->lru, migratelist);
- cc->nr_migratepages++;
- nr_isolated++;
- goto check_compact_cluster;
+ goto isolate_success;
}
}
continue;
@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
+ /*
+ * Migration will fail if an anonymous page is pinned in memory,
+ * so avoid taking lru_lock and isolating it unnecessarily in an
+ * admittedly racy check.
+ */
+ if (!page_mapping(page) &&
+ page_count(page) > page_mapcount(page))
+ continue;
+
/* Check if it is ok to still hold the lock */
locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
locked, cc);
@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (!cc->sync)
- mode |= ISOLATE_ASYNC_MIGRATE;
-
- if (unevictable)
- mode |= ISOLATE_UNEVICTABLE;
-
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
- cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+ cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
-check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -626,7 +670,6 @@ check_compact_cluster:
next_pageblock:
low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
@@ -634,9 +677,13 @@ next_pageblock:
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- /* Update the pageblock-skip if the whole pageblock was scanned */
+ /*
+ * Update the pageblock-skip information and cached scanner pfn,
+ * if the whole pageblock was scanned without isolating any page.
+ */
if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ update_pageblock_skip(cc, valid_page, nr_isolated,
+ set_unsuitable, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -657,44 +704,48 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
+ unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long block_end_pfn; /* end of current pageblock */
+ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
- * scanned from (or the end of the zone if starting). The low point
- * is the end of the pageblock the migration scanner is using.
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+ * the pageblock boundary, because we do
+ * block_start_pfn -= pageblock_nr_pages in the for loop.
+ * For ending point, take care when isolating in last pageblock of a
+ * a zone which ends in the middle of a pageblock.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
*/
- pfn = cc->free_pfn;
- low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-
- /*
- * Take care that if the migration scanner is at the end of the zone
- * that the free scanner does not accidentally move to the next zone
- * in the next isolation cycle.
- */
- high_pfn = min(low_pfn, pfn);
-
- z_end_pfn = zone_end_pfn(zone);
+ block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
+ zone_end_pfn(zone));
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
- pfn -= pageblock_nr_pages) {
+ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ block_end_pfn = block_start_pfn,
+ block_start_pfn -= pageblock_nr_pages) {
unsigned long isolated;
/*
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check if we need
- * to schedule.
+ * to schedule, or even abort async compaction.
*/
- cond_resched();
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+ && compact_should_abort(cc))
+ break;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(block_start_pfn))
continue;
/*
@@ -704,7 +755,7 @@ static void isolate_freepages(struct zone *zone,
* i.e. it's possible that all pages within a zones range of
* pages do not belong to a single zone.
*/
- page = pfn_to_page(pfn);
+ page = pfn_to_page(block_start_pfn);
if (page_zone(page) != zone)
continue;
@@ -717,35 +768,38 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Found a block suitable for isolating free pages from */
- isolated = 0;
+ cc->free_pfn = block_start_pfn;
+ isolated = isolate_freepages_block(cc, block_start_pfn,
+ block_end_pfn, freelist, false);
+ nr_freepages += isolated;
/*
- * As pfn may not start aligned, pfn+pageblock_nr_page
- * may cross a MAX_ORDER_NR_PAGES boundary and miss
- * a pfn_valid check. Ensure isolate_freepages_block()
- * only scans within a pageblock
+ * Set a flag that we successfully isolated in this pageblock.
+ * In the next loop iteration, zone->compact_cached_free_pfn
+ * will not be updated and thus it will effectively contain the
+ * highest pageblock we isolated pages from.
*/
- end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
- end_pfn = min(end_pfn, z_end_pfn);
- isolated = isolate_freepages_block(cc, pfn, end_pfn,
- freelist, false);
- nr_freepages += isolated;
+ if (isolated)
+ cc->finished_update_free = true;
/*
- * Record the highest PFN we isolated pages from. When next
- * looking for free pages, the search will restart here as
- * page migration may have returned some pages to the allocator
+ * isolate_freepages_block() might have aborted due to async
+ * compaction being contended
*/
- if (isolated) {
- cc->finished_update_free = true;
- high_pfn = max(high_pfn, pfn);
- }
+ if (cc->contended)
+ break;
}
/* split_free_page does not map the pages */
map_pages(freelist);
- cc->free_pfn = high_pfn;
+ /*
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ */
+ if (block_start_pfn < low_pfn)
+ cc->free_pfn = cc->migrate_pfn;
+
cc->nr_freepages = nr_freepages;
}
@@ -760,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /* Isolate free pages if necessary */
+ /*
+ * Isolate free pages if necessary, and if we are not aborting due to
+ * contention.
+ */
if (list_empty(&cc->freepages)) {
- isolate_freepages(cc->zone, cc);
+ if (!cc->contended)
+ isolate_freepages(cc->zone, cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -776,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage,
}
/*
- * We cannot control nr_migratepages and nr_freepages fully when migration is
- * running as migrate_pages() has no knowledge of compact_control. When
- * migration is complete, we count the number of pages on the lists by hand.
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist. All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
*/
-static void update_nr_listpages(struct compact_control *cc)
+static void compaction_free(struct page *page, unsigned long data)
{
- int nr_migratepages = 0;
- int nr_freepages = 0;
- struct page *page;
-
- list_for_each_entry(page, &cc->migratepages, lru)
- nr_migratepages++;
- list_for_each_entry(page, &cc->freepages, lru)
- nr_freepages++;
+ struct compact_control *cc = (struct compact_control *)data;
- cc->nr_migratepages = nr_migratepages;
- cc->nr_freepages = nr_freepages;
+ list_add(&page->lru, &cc->freepages);
+ cc->nr_freepages++;
}
/* possible outcome of isolate_migratepages */
@@ -839,11 +890,16 @@ static int compact_finished(struct zone *zone,
unsigned int order;
unsigned long watermark;
- if (fatal_signal_pending(current))
+ if (cc->contended || fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
+ /* Let the next compaction start anew. */
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
@@ -941,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
+ const bool sync = cc->mode != MIGRATE_ASYNC;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
@@ -954,11 +1011,19 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
/*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ /*
* Setup to move all movable pages to the end of the zone. Used cached
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
*/
- cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -966,21 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- /*
- * Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
- */
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
- __reset_isolation_suitable(zone);
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
- unsigned long nr_migrate, nr_remaining;
int err;
switch (isolate_migratepages(zone, cc)) {
@@ -995,22 +1054,25 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- nr_migrate = cc->nr_migratepages;
+ if (!cc->nr_migratepages)
+ continue;
+
err = migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
- update_nr_listpages(cc);
- nr_remaining = cc->nr_migratepages;
- trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
- nr_remaining);
+ trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+ &cc->migratepages);
- /* Release isolated pages not migrated */
+ /* All pages were either migrated or will be released */
+ cc->nr_migratepages = 0;
if (err) {
putback_movable_pages(&cc->migratepages);
- cc->nr_migratepages = 0;
- if (err == -ENOMEM) {
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ */
+ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
@@ -1022,12 +1084,13 @@ out:
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
+ trace_mm_compaction_end(ret);
+
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask,
- bool sync, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone, int order,
+ gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
{
unsigned long ret;
struct compact_control cc = {
@@ -1036,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .sync = sync,
+ .mode = mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1058,7 +1121,7 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
- * @sync: Whether migration is synchronous or not
+ * @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that is true if compaction was aborted due to lock contention
* @page: Optionally capture a free page of the requested order during compaction
*
@@ -1066,7 +1129,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended)
+ enum migrate_mode mode, bool *contended)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1091,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync,
+ status = compact_zone_order(zone, order, gfp_mask, mode,
contended);
rc = max(status, rc);
@@ -1127,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
compact_zone(zone, cc);
if (cc->order > 0) {
- int ok = zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0);
- if (ok && cc->order >= zone->compact_order_failed)
- zone->compact_order_failed = cc->order + 1;
- /* Currently async compaction is never deferred. */
- else if (!ok && cc->sync)
- defer_compaction(zone, cc->order);
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
}
VM_BUG_ON(!list_empty(&cc->freepages));
@@ -1145,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .sync = false,
+ .mode = MIGRATE_ASYNC,
};
if (!order)
@@ -1158,7 +1217,8 @@ static void compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .sync = true,
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
};
__compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d2d39a..bd08e9b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
{
int ret = 0;
/* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ if (test_bit(AS_ENOSPC, &mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ if (test_bit(AS_EIO, &mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &mapping->flags))
ret = -EIO;
return ret;
}
@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
+static int page_cache_tree_insert(struct address_space *mapping,
+ struct page *page)
+{
+ void **slot;
+ int error;
+
+ slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
+ if (slot) {
+ void *p;
+
+ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ if (!radix_tree_exceptional_entry(p))
+ return -EEXIST;
+ radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
+ return 0;
+ }
+ error = radix_tree_insert(&mapping->page_tree, page->index, page);
+ if (!error)
+ mapping->nrpages++;
+ return error;
+}
+
/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
+ error = page_cache_tree_insert(mapping, page);
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
- } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
}
@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- if (TestClearPageReclaim(page))
+ /*
+ * TestClearPageReclaim could be used here but it is an atomic
+ * operation and overkill in this particular case. Failing to
+ * shuffle a page marked for immediate reclaim is too mild to
+ * justify taking an atomic operation penalty at the end of
+ * ever page writeback.
+ */
+ if (PageReclaim(page)) {
+ ClearPageReclaim(page);
rotate_reclaimable_page(page);
+ }
if (!test_clear_page_writeback(page))
BUG();
@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
}
/**
- * find_get_page - find and get a page reference
+ * page_cache_next_hole - find the next hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
+ * lowest indexed hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'return - index >=
+ * max_scan' will be true). In rare cases of index wrap-around, 0 will
+ * be returned.
+ *
+ * page_cache_next_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 5, then subsequently a hole is created at
+ * index 10, page_cache_next_hole covering both indexes may return 10
+ * if called under rcu_read_lock.
+ */
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index++;
+ if (index == 0)
+ break;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL(page_cache_next_hole);
+
+/**
+ * page_cache_prev_hole - find the prev hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search backwards in the range [max(index-max_scan+1, 0), index] for
+ * the first hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'index - return >=
+ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
+ * will be returned.
+ *
+ * page_cache_prev_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 10, then subsequently a hole is created at
+ * index 5, page_cache_prev_hole covering both indexes may return 5 if
+ * called under rcu_read_lock.
+ */
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index--;
+ if (index == ULONG_MAX)
+ break;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL(page_cache_prev_hole);
+
+/**
+ * find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
*
- * Is there a pagecache struct page at the given (mapping, offset) tuple?
- * If yes, increment its refcount and return it; if no, return NULL.
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned with an increased refcount.
+ *
+ * If the slot holds a shadow entry of a previously evicted page, it
+ * is returned.
+ *
+ * Otherwise, %NULL is returned.
*/
-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
struct page *page;
@@ -734,24 +854,30 @@ out:
return page;
}
-EXPORT_SYMBOL(find_get_page);
+EXPORT_SYMBOL(find_get_entry);
/**
- * find_lock_page - locate, pin and lock a pagecache page
+ * find_lock_entry - locate, pin and lock a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
*
- * Locates the desired pagecache page, locks it, increments its reference
- * count and returns its address.
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
*
- * Returns zero if the page was not present. find_lock_page() may sleep.
+ * If the slot holds a shadow entry of a previously evicted page, it
+ * is returned.
+ *
+ * Otherwise, %NULL is returned.
+ *
+ * find_lock_entry() may sleep.
*/
-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
repeat:
- page = find_get_page(mapping, offset);
+ page = find_get_entry(mapping, offset);
if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
@@ -764,44 +890,86 @@ repeat:
}
return page;
}
-EXPORT_SYMBOL(find_lock_page);
+EXPORT_SYMBOL(find_lock_entry);
/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
+ * pagecache_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @fgp_flags: PCG flags
+ * @gfp_mask: gfp mask to use for the page cache data page allocation
+ *
+ * Looks up the page cache slot at @mapping & @offset.
*
- * Locates a page in the pagecache. If the page is not present, a new page
- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
- * LRU list. The returned page is locked and has its reference count
- * incremented.
+ * PCG flags modify how the page is returned
*
- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
- * allocation!
+ * FGP_ACCESSED: the page will be marked accessed
+ * FGP_LOCK: Page is return locked
+ * FGP_CREAT: If page is not present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, %NULL is returned.
*
- * find_or_create_page() returns the desired page's address, or zero on
- * memory exhaustion.
+ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
+ * if the GFP flags specified for FGP_CREAT are atomic.
+ *
+ * If there is a page cache page, it is returned with an increased refcount.
*/
-struct page *find_or_create_page(struct address_space *mapping,
- pgoff_t index, gfp_t gfp_mask)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+ int fgp_flags, gfp_t gfp_mask)
{
struct page *page;
- int err;
+
repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
+ page = find_get_entry(mapping, offset);
+ if (radix_tree_exceptional_entry(page))
+ page = NULL;
+ if (!page)
+ goto no_page;
+
+ if (fgp_flags & FGP_LOCK) {
+ if (fgp_flags & FGP_NOWAIT) {
+ if (!trylock_page(page)) {
+ page_cache_release(page);
+ return NULL;
+ }
+ } else {
+ lock_page(page);
+ }
+
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+
+ if (page && (fgp_flags & FGP_ACCESSED))
+ mark_page_accessed(page);
+
+no_page:
+ if (!page && (fgp_flags & FGP_CREAT)) {
+ int err;
+ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS)
+ gfp_mask &= ~__GFP_FS;
+
page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
- /*
- * We want a regular kernel memory (not highmem or DMA etc)
- * allocation for the radix tree nodes, but we need to honour
- * the context-specific requirements the caller has asked for.
- * GFP_RECLAIM_MASK collects those requirements.
- */
- err = add_to_page_cache_lru(page, mapping, index,
- (gfp_mask & GFP_RECLAIM_MASK));
+
+ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ fgp_flags |= FGP_LOCK;
+
+ /* Init accessed so avoit atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ init_page_accessed(page);
+
+ err = add_to_page_cache_lru(page, mapping, offset,
+ gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -809,9 +977,80 @@ repeat:
goto repeat;
}
}
+
return page;
}
-EXPORT_SYMBOL(find_or_create_page);
+EXPORT_SYMBOL(pagecache_get_page);
+
+/**
+ * find_get_entries - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page cache index
+ * @nr_entries: The maximum number of entries
+ * @entries: Where the resulting entries are placed
+ * @indices: The cache indices corresponding to the entries in @entries
+ *
+ * find_get_entries() will search for and return a group of up to
+ * @nr_entries entries in the mapping. The entries are placed at
+ * @entries. find_get_entries() takes a reference against any actual
+ * pages it returns.
+ *
+ * The search returns a group of mapping-contiguous page cache entries
+ * with ascending indexes. There may be holes in the indices due to
+ * not-present pages.
+ *
+ * Any shadow entries of evicted pages are included in the returned
+ * array.
+ *
+ * find_get_entries() returns the number of pages and shadow entries
+ * which were found.
+ */
+unsigned find_get_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
+{
+ void **slot;
+ unsigned int ret = 0;
+ struct radix_tree_iter iter;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * Otherwise, we must be storing a swap entry
+ * here as an exceptional entry: so return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = iter.index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
/**
* find_get_pages - gang pagecache lookup
@@ -1031,39 +1270,6 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page(), but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed. This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
-{
- struct page *page = find_get_page(mapping, index);
-
- if (page) {
- if (trylock_page(page))
- return page;
- page_cache_release(page);
- return NULL;
- }
- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
- page_cache_release(page);
- page = NULL;
- }
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_nowait);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1797,6 +2003,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
@@ -1823,6 +2041,8 @@ repeat:
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
}
}
return page;
@@ -1859,6 +2079,10 @@ retry:
if (err < 0) {
page_cache_release(page);
return ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
}
out:
mark_page_accessed(page);
@@ -1866,40 +2090,25 @@ out:
}
/**
- * read_cache_page_async - read into page cache, fill it if needed
+ * read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
* @data: first arg to filler(data, page) function, often left as NULL
*
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
* Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
+ * not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
*/
-struct page *read_cache_page_async(struct address_space *mapping,
+struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
-EXPORT_SYMBOL(read_cache_page_async);
-
-static struct page *wait_on_page_read(struct page *page)
-{
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- }
- return page;
-}
+EXPORT_SYMBOL(read_cache_page);
/**
* read_cache_page_gfp - read into page cache, using specified page allocation flags.
@@ -1918,31 +2127,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
{
filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+ return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
-/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: first arg to filler(data, page) function, often left as NULL
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page then wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data)
-{
- return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
-}
-EXPORT_SYMBOL(read_cache_page);
-
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -1976,7 +2164,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
char *kaddr;
size_t copied;
- BUG_ON(!pagefault_disabled());
kaddr = kmap_atomic(page);
if (likely(i->nr_segs == 1)) {
int left;
@@ -2186,7 +2373,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
{
const struct address_space_operations *aops = mapping->a_ops;
- mark_page_accessed(page);
return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
@@ -2268,34 +2454,17 @@ EXPORT_SYMBOL(generic_file_direct_write);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
- int status;
- gfp_t gfp_mask;
struct page *page;
- gfp_t gfp_notmask = 0;
+ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
- gfp_mask = mapping_gfp_mask(mapping);
- if (mapping_cap_account_dirty(mapping))
- gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
- gfp_notmask = __GFP_FS;
-repeat:
- page = find_lock_page(mapping, index);
+ fgp_flags |= FGP_NOFS;
+
+ page = pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
if (page)
- goto found;
+ wait_for_stable_page(page);
- page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
- if (!page)
- return NULL;
- status = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & ~gfp_notmask);
- if (unlikely(status)) {
- page_cache_release(page);
- if (status == -EEXIST)
- goto repeat;
- return NULL;
- }
-found:
- wait_for_stable_page(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2344,18 +2513,15 @@ again:
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
@@ -2555,8 +2721,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ret > 0) {
ssize_t err;
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
ret = err;
}
return ret;
diff --git a/mm/fremap.c b/mm/fremap.c
index bbc4d66..34feba6 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,28 +23,44 @@
#include "internal.h"
+static int mm_counter(struct page *page)
+{
+ return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
+}
+
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page;
+ swp_entry_t entry;
if (pte_present(pte)) {
- struct page *page;
-
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, mm_counter(page));
page_remove_rmap(page);
page_cache_release(page);
+ }
+ } else { /* zap_pte() is not called when pte_none() */
+ if (!pte_file(pte)) {
update_hiwater_rss(mm);
- dec_mm_counter(mm, MM_FILEPAGES);
+ entry = pte_to_swp_entry(pte);
+ if (non_swap_entry(entry)) {
+ if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+ dec_mm_counter(mm, mm_counter(page));
+ }
+ } else {
+ free_swap_and_cache(entry);
+ dec_mm_counter(mm, MM_SWAPENTS);
+ }
}
- } else {
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
}
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdc..f2a3571 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -244,8 +244,10 @@ int __frontswap_store(struct page *page)
the (older) page from frontswap
*/
inc_frontswap_failed_stores();
- if (dup)
+ if (dup) {
__frontswap_clear(sis, offset);
+ frontswap_ops->invalidate_page(type, offset);
+ }
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
@@ -327,15 +329,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
- int type;
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ plist_for_each_entry(si, &swap_active_head, list)
totalpages += atomic_read(&si->frontswap_pages);
- }
return totalpages;
}
@@ -347,11 +346,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
- int type;
assert_spin_locked(&swap_lock);
- for (type = swap_list.head; type >= 0; type = si->next) {
- si = swap_info[type];
+ plist_for_each_entry(si, &swap_active_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +363,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
- *swapid = type;
+ *swapid = si->type;
ret = 0;
break;
}
@@ -413,7 +410,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_list.head each time
+ * so restart scan from swap_active_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/highmem.c b/mm/highmem.c
index b1c7d43..b32b70c 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,11 +29,10 @@
#include <linux/kgdb.h>
#include <asm/tlbflush.h>
-#ifndef CONFIG_PREEMPT_RT_FULL
+
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
DEFINE_PER_CPU(int, __kmap_atomic_idx);
#endif
-#endif
/*
* Virtual_count is not a pure "count".
@@ -48,9 +47,8 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx);
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
-#ifndef CONFIG_PREEMPT_RT_FULL
+
EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-#endif
unsigned int nr_free_highpages (void)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 389973f..04535b6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -192,7 +192,7 @@ retry:
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
goto retry;
}
@@ -224,7 +224,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-#endif
-
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -1549,15 +1541,22 @@ pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag)
{
+ pgd_t *pgd;
+ pud_t *pud;
pmd_t *pmd, *ret = NULL;
if (address & ~HPAGE_PMD_MASK)
goto out;
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
goto out;
- if (pmd_none(*pmd))
+ pmd = pmd_offset(pud, address);
+
+ if (!pmd_present(*pmd))
goto out;
if (pmd_page(*pmd) != page)
goto out;
@@ -1748,21 +1747,24 @@ static int __split_huge_page_map(struct page *page,
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
+ if (pmd_write(*pmd))
+ BUG_ON(page_mapcount(page) != 1);
haddr = address;
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
+ /*
+ * Note that pmd_numa is not transferred deliberately
+ * to avoid any possibility that pte_numa leaks to
+ * a PROT_NONE VMA by accident.
+ */
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
- else
- BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
- if (pmd_numa(*pmd))
- entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -2197,7 +2199,58 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
+static int khugepaged_node_load[MAX_NUMNODES];
+
+static bool khugepaged_scan_abort(int nid)
+{
+ int i;
+
+ /*
+ * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * allocate memory locally.
+ */
+ if (!zone_reclaim_mode)
+ return false;
+
+ /* If there is a count for this node already, it must be acceptable */
+ if (khugepaged_node_load[nid])
+ return false;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (!khugepaged_node_load[i])
+ continue;
+ if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ return true;
+ }
+ return false;
+}
+
#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2231,9 +2284,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
- node, __GFP_OTHER_NODE);
-
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2249,6 +2301,17 @@ static struct page
return *hpage;
}
#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2352,8 +2415,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
- if (pmd_trans_huge(*pmd))
- goto out;
anon_vma_lock_write(vma->anon_vma);
@@ -2452,9 +2513,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
- if (pmd_trans_huge(*pmd))
- goto out;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2531,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Chose the node of the first page. This could
- * be more sophisticated and look at more pages,
- * but isn't for now.
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khupaged will allocate hugepage from the node has the max
+ * hit record.
*/
- if (node == NUMA_NO_NODE)
- node = page_to_nid(page);
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node))
+ goto out_unmap;
+ khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2491,9 +2554,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
+ node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
+ }
out:
return ret;
}
@@ -2801,12 +2866,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
+ pgd_t *pgd;
+ pud_t *pud;
pmd_t *pmd;
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f0a4ca4..c33d8a6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
goto err;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask(h), &mpol, &nodemask);
@@ -596,7 +596,7 @@ retry_cpuset:
}
mpol_cond_put(mpol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -1177,6 +1177,7 @@ static void return_unused_surplus_pages(struct hstate *h,
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
+ cond_resched_lock(&hugetlb_lock);
}
}
@@ -1552,6 +1553,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
+ cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2112,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->max_huge_pages;
if (write && h->order >= MAX_ORDER)
@@ -2165,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->nr_overcommit_huge_pages;
if (write && h->order >= MAX_ORDER)
@@ -2190,6 +2198,8 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return;
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
@@ -2206,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
int hugetlb_report_node_meminfo(int nid, char *buf)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
@@ -2220,6 +2232,9 @@ void hugetlb_show_meminfo(void)
struct hstate *h;
int nid;
+ if (!hugepages_supported())
+ return;
+
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
@@ -2379,6 +2394,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
+}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -2406,7 +2446,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock(&dst->page_table_lock);
spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
- if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ entry = huge_ptep_get(src_pte);
+ if (huge_pte_none(entry)) { /* skip none entry */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
+ }
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_ptep_get(src_pte);
@@ -2424,32 +2481,6 @@ nomem:
return -ENOMEM;
}
-static int is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
- else
- return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
- else
- return 0;
-}
-
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
diff --git a/mm/internal.h b/mm/internal.h
index 8b6cfd6..d610f7c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ return __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
@@ -120,7 +135,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- bool sync; /* Synchronous migration */
+ enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool finished_update_free; /* True when the zone cached pfns are
* no longer being updated
@@ -130,7 +145,10 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool contended; /* True if a lock was contended */
+ bool contended; /* True if a lock was contended, or
+ * need_resched() true during async
+ * compaction
+ */
};
unsigned long
@@ -369,5 +387,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0e..31f01c5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
- if (ptr + size > object->pointer + object->size) {
+ if (size == SIZE_MAX) {
+ size = object->pointer + object->size - ptr;
+ } else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
diff --git a/mm/ksm.c b/mm/ksm.c
index c78fff1..29cbd06 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
- BUG_ON(pmd_trans_huge(*pmd));
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb9..a402f8f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
for (; start < end; start += PAGE_SIZE) {
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = find_get_page(mapping, index);
+ page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
page_cache_release(page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bc16ebc..ff64896 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,9 @@ struct mem_cgroup {
/* vmpressure notifications */
struct vmpressure vmpressure;
+ /* css_online() has been completed */
+ int initialized;
+
/*
* the counter to account for mem+swap usage.
*/
@@ -1089,9 +1092,23 @@ skip_node:
* skipping css reference should be safe.
*/
if (next_css) {
- if ((next_css == &root->css) ||
- ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
- return mem_cgroup_from_css(next_css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
+
+ if (next_css == &root->css)
+ return memcg;
+
+ if (css_tryget(next_css)) {
+ if (memcg->initialized) {
+ /*
+ * Make sure the memcg is initialized:
+ * mem_cgroup_css_online() orders the the
+ * initialization against setting the flag.
+ */
+ smp_rmb();
+ return memcg;
+ }
+ css_put(next_css);
+ }
prev_css = next_css;
goto skip_node;
@@ -1820,13 +1837,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
@@ -2473,7 +2495,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
- curcpu = get_cpu_light();
+ curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2490,7 +2512,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu_light();
+ put_cpu();
if (!sync)
goto out;
@@ -5643,8 +5665,12 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
+ spin_lock(&memcg_oom_lock);
+
list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
+
+ spin_unlock(&memcg_oom_lock);
return 0;
}
@@ -6322,6 +6348,16 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
+
+ if (!error) {
+ /*
+ * Make sure the memcg is initialized: mem_cgroup_iter()
+ * orders reading memcg->initialized against its callers
+ * reading the memcg members.
+ */
+ smp_wmb();
+ memcg->initialized = 1;
+ }
return error;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5ea3cf7..4ab233d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -208,9 +208,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#endif
si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
- if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, t);
+ ret = force_sig_info(SIGBUS, &si, current);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -384,20 +384,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t)
+ if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+ return t;
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
+{
+ struct task_struct *t;
if (!tsk->mm)
- return 0;
- if (tsk->flags & PF_MCE_PROCESS)
- return !!(tsk->flags & PF_MCE_EARLY);
- return sysctl_memory_failure_early_kill;
+ return NULL;
+ if (force_early)
+ return tsk;
+ t = find_early_kill_thread(tsk);
+ if (t)
+ return t;
+ if (sysctl_memory_failure_early_kill)
+ return tsk;
+ return NULL;
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -412,16 +443,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -432,7 +464,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -442,10 +474,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
-
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
@@ -455,8 +487,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -469,7 +501,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -480,9 +513,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -967,7 +1000,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill);
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
@@ -1085,15 +1118,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "just unpoisoned", "filter hit", and
- * "race with other subpage."
+ * Check "filter hit" and "race with other subpage."
*/
lock_page(hpage);
- if (!PageHWPoison(hpage)
- || (hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- return 0;
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1156,6 +1190,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
@@ -1518,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_move(&hpage->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1599,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index 0dcdc84..db2916f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -808,20 +808,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (likely(!non_swap_entry(entry)))
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
rss[MM_SWAPENTS]++;
- else if (is_migration_entry(entry)) {
+ } else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
if (PageAnon(page))
@@ -878,7 +878,7 @@ out_set_pte:
return 0;
}
-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
@@ -1929,12 +1929,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags)
{
struct vm_area_struct *vma;
+ vm_flags_t vm_flags;
int ret;
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
+ vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+ if (!(vm_flags & vma->vm_flags))
+ return -EFAULT;
+
ret = handle_mm_fault(mm, vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -3189,7 +3194,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (prev && prev->vm_end == address)
return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
- expand_downwards(vma, address - PAGE_SIZE);
+ return expand_downwards(vma, address - PAGE_SIZE);
}
if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
struct vm_area_struct *next = vma->vm_next;
@@ -3198,7 +3203,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (next && next->vm_start == address + PAGE_SIZE)
return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
- expand_upwards(vma, address + PAGE_SIZE);
+ return expand_upwards(vma, address + PAGE_SIZE);
}
return 0;
}
@@ -3693,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm,
pte_t entry;
spinlock_t *ptl;
- entry = *pte;
+ entry = ACCESS_ONCE(*pte);
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
@@ -3742,32 +3747,6 @@ unlock:
return 0;
}
-#ifdef CONFIG_PREEMPT_RT_FULL
-void pagefault_disable(void)
-{
- migrate_disable();
- current->pagefault_disabled++;
- /*
- * make sure to have issued the store before a pagefault
- * can hit.
- */
- barrier();
-}
-EXPORT_SYMBOL(pagefault_disable);
-
-void pagefault_enable(void)
-{
- /*
- * make sure to issue those last loads/stores before enabling
- * the pagefault handler again.
- */
- barrier();
- current->pagefault_disabled--;
- migrate_enable();
-}
-EXPORT_SYMBOL(pagefault_enable);
-#endif
-
/*
* By the time we get here, we already hold the mm semaphore
*/
@@ -4344,35 +4323,3 @@ void copy_user_huge_page(struct page *dst, struct page *src,
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-
-#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0)
-/*
- * Heinous hack, relies on the caller doing something like:
- *
- * pte = alloc_pages(PGALLOC_GFP, 0);
- * if (pte)
- * pgtable_page_ctor(pte);
- * return pte;
- *
- * This ensures we release the page and return NULL when the
- * lock allocation fails.
- */
-struct page *pte_lock_init(struct page *page)
-{
- page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
- if (page->ptl) {
- spin_lock_init(__pte_lockptr(page));
- } else {
- __free_page(page);
- page = NULL;
- }
- return page;
-}
-
-void pte_lock_deinit(struct page *page)
-{
- kfree(page->ptl);
- page->mapping = NULL;
-}
-
-#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3..d317305 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* alloc_migrate_target should be improooooved!!
* migrate_pages returns # of failed pages.
*/
- ret = migrate_pages(&source, alloc_migrate_target, 0,
+ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret)
putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 927a69c..3650036 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -525,9 +525,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
#ifdef CONFIG_HUGETLB_PAGE
int nid;
struct page *page;
+ pte_t entry;
spin_lock(&vma->vm_mm->page_table_lock);
- page = pte_page(huge_ptep_get((pte_t *)pmd));
+ entry = huge_ptep_get((pte_t *)pmd);
+ if (!pte_present(entry))
+ goto unlock;
+ page = pte_page(entry);
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
goto unlock;
@@ -649,19 +653,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
* @nodes and @flags,) it's isolated and queued to the pagelist which is
* passed via @private.)
*/
-static struct vm_area_struct *
+static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
- int err;
- struct vm_area_struct *first, *vma, *prev;
-
+ int err = 0;
+ struct vm_area_struct *vma, *prev;
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
+ vma = find_vma(mm, start);
+ if (!vma)
+ return -EFAULT;
prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
@@ -671,9 +674,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
if (flags & MPOL_MF_LAZY) {
@@ -687,15 +690,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
err = queue_pages_pgd_range(vma, start, endvma, nodes,
flags, private);
- if (err) {
- first = ERR_PTR(err);
+ if (err)
break;
- }
}
next:
prev = vma;
}
- return first;
+ return err;
}
/*
@@ -1059,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest,
+ err = migrate_pages(&pagelist, new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1180,30 +1181,31 @@ out:
/*
* Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
* Search forward from there, if not. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ struct vm_area_struct *vma;
unsigned long uninitialized_var(address);
+ vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
vma = vma->vm_next;
}
- /*
- * queue_pages_range() confirms that @page belongs to some vma,
- * so vma shouldn't be NULL.
- */
- BUG_ON(!vma);
- if (PageHuge(page))
+ if (PageHuge(page)) {
+ BUG_ON(!vma);
return alloc_huge_page_noerr(vma, address, 1);
+ }
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else
@@ -1219,7 +1221,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
return NULL;
}
@@ -1229,7 +1231,6 @@ static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
- struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
@@ -1295,11 +1296,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- vma = queue_pages_range(mm, start, end, nmask,
+ err = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
-
- err = PTR_ERR(vma); /* maybe ... */
- if (!IS_ERR(vma))
+ if (!err)
err = mbind_range(mm, start, end, new);
if (!err) {
@@ -1307,9 +1306,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma,
- MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1875,7 +1873,7 @@ int node_random(const nodemask_t *maskp)
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
- * Must be protected by get_mems_allowed()
+ * Must be protected by read_mems_allowed_begin()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -2039,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -2047,7 +2045,7 @@ retry_cpuset:
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2057,7 +2055,7 @@ retry_cpuset:
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
}
@@ -2091,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
pol = &default_policy;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/*
* No reference counting needed for current->mempolicy
@@ -2104,7 +2102,7 @@ retry_cpuset:
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2148,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
- rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
@@ -2156,7 +2153,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index e3cf71d..fac5fa0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -136,8 +136,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
- if (pmd_trans_huge(*pmd))
- goto out;
ptep = pte_offset_map(pmd, addr);
@@ -164,8 +162,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
+
+ /* Recheck VMA as permissions can change since migration started */
if (is_write_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ pte = maybe_mkwrite(pte, vma);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -867,8 +868,9 @@ out:
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -912,11 +914,18 @@ out:
page_is_file_cache(page));
putback_lru_page(page);
}
+
/*
- * Move the new page to the LRU. If migration was not successful
- * then this will free the page.
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, putback_lru_page() will drop the reference grabbed
+ * during isolation.
*/
- putback_lru_page(newpage);
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+ ClearPageSwapBacked(newpage);
+ put_new_page(newpage, private);
+ } else
+ putback_lru_page(newpage);
+
if (result) {
if (rc)
*result = rc;
@@ -945,8 +954,9 @@ out:
* will wait in the page fault for migration to complete.
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
- unsigned long private, struct page *hpage,
- int force, enum migrate_mode mode)
+ free_page_t put_new_page, unsigned long private,
+ struct page *hpage, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -982,20 +992,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (!page_mapped(hpage))
rc = move_to_new_page(new_hpage, hpage, 1, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
put_anon_vma(anon_vma);
- if (!rc)
+ if (rc == MIGRATEPAGE_SUCCESS)
hugetlb_cgroup_migrate(hpage, new_hpage);
unlock_page(hpage);
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- put_page(new_hpage);
+
+ /*
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, put_page() will drop the reference grabbed during
+ * isolation.
+ */
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ put_new_page(new_hpage, private);
+ else
+ put_page(new_hpage);
+
if (result) {
if (rc)
*result = rc;
@@ -1012,6 +1032,8 @@ out:
* @from: The list of pages to be migrated.
* @get_new_page: The function used to allocate free pages to be used
* as the target of the page migration.
+ * @put_new_page: The function used to free target pages if migration
+ * fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
@@ -1025,7 +1047,8 @@ out:
* Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
- unsigned long private, enum migrate_mode mode, int reason)
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
@@ -1047,10 +1070,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, mode);
+ put_new_page, private, page,
+ pass > 2, mode);
else
- rc = unmap_and_move(get_new_page, private,
- page, pass > 2, mode);
+ rc = unmap_and_move(get_new_page, put_new_page,
+ private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
@@ -1194,7 +1218,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node,
+ err = migrate_pages(&pagelist, new_page_node, NULL,
(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1643,7 +1667,8 @@ int migrate_misplaced_page(struct page *page, int node)
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
if (nr_remaining) {
putback_lru_pages(&migratepages);
isolated = 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56..06cb810 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_page(mapping, pgoff);
#ifdef CONFIG_SWAP
- /* shmem/tmpfs may return swap: account for swapcache page too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
if (page) {
present = PageUptodate(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6ee..1b12dfa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -153,6 +154,7 @@ unsigned int munlock_vma_page(struct page *page)
{
unsigned int nr_pages;
+ /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index af99b9e..441602d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -744,8 +746,11 @@ again: remove_next = 1 + (end > next->vm_end);
* shrinking vma had, to cover any anon pages imported.
*/
if (exporter && exporter->anon_vma && !importer->anon_vma) {
- if (anon_vma_clone(importer, exporter))
- return -ENOMEM;
+ int error;
+
+ error = anon_vma_clone(importer, exporter);
+ if (error)
+ return error;
importer->anon_vma = exporter->anon_vma;
}
}
@@ -1980,34 +1985,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2045,14 +2049,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start;
+ unsigned long new_start, actual_size;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ actual_size = size;
+ if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+ actual_size -= PAGE_SIZE;
+ if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2379,7 +2386,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -2416,7 +2425,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (err)
goto out_free_vma;
- if (anon_vma_clone(new, vma))
+ err = anon_vma_clone(new, vma);
+ if (err)
goto out_free_mpol;
if (new->vm_file)
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index adfce87..8a8cd02 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -23,7 +23,6 @@ void use_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
- preempt_disable_rt();
active_mm = tsk->active_mm;
if (active_mm != mm) {
atomic_inc(&mm->mm_count);
@@ -31,7 +30,6 @@ void use_mm(struct mm_struct *mm)
}
tsk->mm = mm;
switch_mm(active_mm, mm, tsk);
- preempt_enable_rt();
task_unlock(tsk);
if (active_mm != mm)
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb..05f1180 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
- if (extent == HPAGE_PMD_SIZE)
+ if (extent == HPAGE_PMD_SIZE) {
+ VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ anon_vma_lock_write(vma->anon_vma);
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
+ if (need_rmap_locks)
+ anon_vma_unlock_write(vma->anon_vma);
+ }
if (err > 0) {
need_flush = true;
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f15..1221d2b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = NULL;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(curr->mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = mm->mmap_cache;
- if (vma && vma->vm_start == addr && vma->vm_end == end)
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e73f01c..712a0f8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
{
- struct task_struct *start = tsk;
+ struct task_struct *tsk;
+ bool ret = false;
- do {
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- if (mempolicy_nodemask_intersects(tsk, mask))
- return true;
+ ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- if (cpuset_mems_allowed_intersects(current, tsk))
- return true;
+ ret = cpuset_mems_allowed_intersects(current, tsk);
}
- } while_each_thread(start, tsk);
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
- do {
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
- return t;
+ goto found;
task_unlock(t);
- } while_each_thread(p, t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
- return NULL;
+ return t;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
break;
};
points = oom_badness(p, NULL, nodemask, totalpages);
- if (points > chosen_points) {
- chosen = p;
- chosen_points = points;
- }
- } while_each_thread(g, p);
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points && thread_group_leader(chosen))
+ continue;
+
+ chosen = p;
+ chosen_points = points;
+ }
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -394,6 +406,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
dump_tasks(memcg, nodemask);
}
+/*
+ * Number of OOM killer invocations (including memcg OOM killer).
+ * Primarily used by PM freezer to check for potential races with
+ * OOM killed frozen task.
+ */
+static atomic_t oom_kills = ATOMIC_INIT(0);
+
+int oom_kills_count(void)
+{
+ return atomic_read(&oom_kills);
+}
+
+void note_oom_kill(void)
+{
+ atomic_inc(&oom_kills);
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
@@ -406,7 +435,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t = p;
+ struct task_struct *t;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +466,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- do {
+ for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -455,13 +484,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- } while_each_thread(p, t);
+ }
read_unlock(&tasklist_lock);
- rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
- rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -487,6 +514,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
+ rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1..9f45f87 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -593,14 +593,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
* (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
* => fast response on large errors; small oscillation near setpoint
*/
-static inline long long pos_ratio_polynom(unsigned long setpoint,
+static long long pos_ratio_polynom(unsigned long setpoint,
unsigned long dirty,
unsigned long limit)
{
long long pos_ratio;
long x;
- x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
limit - setpoint + 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -842,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
x_intercept = bdi_setpoint + span;
if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
x_intercept - bdi_setpoint + 1);
} else
pos_ratio /= 4;
@@ -1324,9 +1324,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
if (bdi_bg_thresh)
- *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
- background_thresh,
- dirty_thresh);
+ *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
+ background_thresh,
+ dirty_thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -2398,7 +2398,7 @@ int test_clear_page_writeback(struct page *page)
return ret;
}
-int test_set_page_writeback(struct page *page)
+int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret;
@@ -2423,9 +2423,10 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_TOWRITE);
+ if (!keep_write)
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
@@ -2436,7 +2437,7 @@ int test_set_page_writeback(struct page *page)
return ret;
}
-EXPORT_SYMBOL(test_set_page_writeback);
+EXPORT_SYMBOL(__test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36c40eb..7abab3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,7 +61,6 @@
#include <linux/page-debug-flags.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
-#include <linux/locallock.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -70,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -231,18 +231,6 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
-static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
-
-#ifdef CONFIG_PREEMPT_RT_BASE
-# define cpu_lock_irqsave(cpu, flags) \
- local_lock_irqsave_on(pa_lock, flags, cpu)
-# define cpu_unlock_irqrestore(cpu, flags) \
- local_unlock_irqrestore_on(pa_lock, flags, cpu)
-#else
-# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
-# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
-#endif
-
int page_group_by_mobility_disabled __read_mostly;
void set_pageblock_migratetype(struct page *page, int migratetype)
@@ -417,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
return bad;
}
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+static inline void prep_zero_page(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
{
int i;
@@ -461,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { }
static inline void clear_page_guard_flag(struct page *page) { }
#endif
-static inline void set_page_order(struct page *page, int order)
+static inline void set_page_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -512,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
- int order)
+ unsigned int order)
{
if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
+
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
+
+ /*
+ * zone check is done late to avoid uselessly
+ * calculating zone/node ids for pages that could
+ * never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
return 0;
@@ -558,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
*/
static inline void __free_one_page(struct page *page,
+ unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
@@ -574,7 +574,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(migratetype == -1);
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
VM_BUG_ON(page_idx & ((1 << order) - 1));
VM_BUG_ON(bad_range(zone, page));
@@ -648,7 +648,7 @@ static inline int free_pages_check(struct page *page)
}
/*
- * Frees a number of pages which have been collected from the pcp lists.
+ * Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -659,48 +659,17 @@ static inline int free_pages_check(struct page *page)
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct list_head *list)
+ struct per_cpu_pages *pcp)
{
+ int migratetype = 0;
+ int batch_free = 0;
int to_free = count;
- unsigned long flags;
-
- spin_lock_irqsave(&zone->lock, flags);
- zone->pages_scanned = 0;
-
- while (!list_empty(list)) {
- struct page *page = list_first_entry(list, struct page, lru);
- int mt; /* migratetype of the to-be-freed page */
-
- /* must delete as __free_one_page list manipulates */
- list_del(&page->lru);
-
- mt = get_freepage_migratetype(page);
- /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, zone, 0, mt);
- trace_mm_page_pcpu_drain(page, 0, mt);
- if (likely(!is_migrate_isolate_page(page))) {
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
- if (is_migrate_cma(mt))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
- }
-
- to_free--;
- }
- WARN_ON(to_free != 0);
- spin_unlock_irqrestore(&zone->lock, flags);
-}
+ unsigned long nr_scanned;
-/*
- * Moves a number of pages from the PCP lists to free list which
- * is freed outside of the locked region.
- *
- * Assumes all pages on list are in same zone, and of same order.
- * count is the number of pages to free.
- */
-static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
- struct list_head *dst)
-{
- int migratetype = 0, batch_free = 0;
+ spin_lock(&zone->lock);
+ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ if (nr_scanned)
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
while (to_free) {
struct page *page;
@@ -717,7 +686,7 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
- list = &src->lists[migratetype];
+ list = &pcp->lists[migratetype];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
@@ -725,25 +694,40 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
batch_free = to_free;
do {
- page = list_last_entry(list, struct page, lru);
+ int mt; /* migratetype of the to-be-freed page */
+
+ page = list_entry(list->prev, struct page, lru);
+ /* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- list_add(&page->lru, dst);
+ mt = get_freepage_migratetype(page);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ if (likely(!is_migrate_isolate_page(page))) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+ }
} while (--to_free && --batch_free && !list_empty(list));
}
+ spin_unlock(&zone->lock);
}
-static void free_one_page(struct zone *zone, struct page *page, int order,
+static void free_one_page(struct zone *zone,
+ struct page *page, unsigned long pfn,
+ unsigned int order,
int migratetype)
{
- unsigned long flags;
-
- spin_lock_irqsave(&zone->lock, flags);
- zone->pages_scanned = 0;
+ unsigned long nr_scanned;
+ spin_lock(&zone->lock);
+ nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ if (nr_scanned)
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
- __free_one_page(page, zone, order, migratetype);
+ __free_one_page(page, pfn, zone, order, migratetype);
if (unlikely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
}
static bool free_pages_prepare(struct page *page, unsigned int order)
@@ -777,16 +761,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
+ unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order))
return;
- local_lock_irqsave(pa_lock, flags);
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
- free_one_page(page_zone(page), page, order, migratetype);
- local_unlock_irqrestore(pa_lock, flags);
+ free_one_page(page_zone(page), page, pfn, order, migratetype);
+ local_irq_restore(flags);
}
void __init __free_pages_bootmem(struct page *page, unsigned int order)
@@ -821,9 +806,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_page_refcounted(page);
set_pageblock_migratetype(page, MIGRATE_CMA);
- __free_pages(page, pageblock_order);
+
+ if (pageblock_order >= MAX_ORDER) {
+ i = pageblock_nr_pages;
+ p = page;
+ do {
+ set_page_refcounted(p);
+ __free_pages(p, MAX_ORDER - 1);
+ p += MAX_ORDER_NR_PAGES;
+ } while (i -= MAX_ORDER_NR_PAGES);
+ } else {
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+ }
+
adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif
@@ -893,7 +890,7 @@ static inline int check_new_page(struct page *page)
return 0;
}
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
{
int i;
@@ -942,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
+ set_freepage_migratetype(page, migratetype);
return page;
}
@@ -1066,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
{
int current_order = page_order(page);
+ /*
+ * When borrowing from MIGRATE_CMA, we need to release the excess
+ * buddy pages to CMA itself. We also ensure the freepage_migratetype
+ * is set to CMA so it is returned to the correct freelist in case
+ * the page ends up being not actually allocated from the pcp lists.
+ */
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1097,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
- int current_order;
+ unsigned int current_order;
struct page *page;
int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1; current_order >= order;
- --current_order) {
+ for (current_order = MAX_ORDER-1;
+ current_order >= order && current_order <= MAX_ORDER-1;
+ --current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
@@ -1130,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
- /*
- * Borrow the excess buddy pages as well, irrespective
- * of whether we stole freepages, or took ownership of
- * the pageblock or not.
- *
- * Exception: When borrowing from MIGRATE_CMA, release
- * the excess buddy pages to CMA itself.
- */
expand(zone, page, order, current_order, area,
- is_migrate_cma(migratetype)
- ? migratetype : start_migratetype);
+ new_type);
+ /* The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages. This is OK as long as it does
+ * not differ for MIGRATE_CMA type.
+ */
+ set_freepage_migratetype(page, new_type);
- trace_mm_page_alloc_extfrag(page, order,
- current_order, start_migratetype, migratetype,
- new_type == start_migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype, new_type);
return page;
}
@@ -1190,9 +1191,9 @@ retry_reserve:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, int cold)
+ int migratetype, bool cold)
{
- int mt = migratetype, i;
+ int i;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -1209,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* merge IO requests if the physical pages are ordered
* properly.
*/
- if (likely(cold == 0))
+ if (likely(!cold))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- if (IS_ENABLED(CONFIG_CMA)) {
- mt = get_pageblock_migratetype(page);
- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
- mt = migratetype;
- }
- set_freepage_migratetype(page, mt);
list = &page->lru;
- if (is_migrate_cma(mt))
+ if (is_migrate_cma(get_freepage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1241,31 +1236,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
- LIST_HEAD(dst);
int to_drain;
unsigned long batch;
- local_lock_irqsave(pa_lock, flags);
+ local_irq_save(flags);
batch = ACCESS_ONCE(pcp->batch);
if (pcp->count >= batch)
to_drain = batch;
else
to_drain = pcp->count;
if (to_drain > 0) {
- isolate_pcp_pages(to_drain, pcp, &dst);
+ free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
}
- local_unlock_irqrestore(pa_lock, flags);
- free_pcppages_bulk(zone, to_drain, &dst);
-}
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return false;
+ local_irq_restore(flags);
}
#endif
@@ -1284,21 +1268,16 @@ static void drain_pages(unsigned int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- LIST_HEAD(dst);
- int count;
- cpu_lock_irqsave(cpu, flags);
+ local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- count = pcp->count;
- if (count) {
- isolate_pcp_pages(count, pcp, &dst);
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
}
- cpu_unlock_irqrestore(cpu, flags);
- if (count)
- free_pcppages_bulk(zone, count, &dst);
+ local_irq_restore(flags);
}
}
@@ -1351,12 +1330,7 @@ void drain_all_pages(void)
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
-#ifndef CONFIG_PREEMPT_RT_BASE
on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
-#else
- for_each_cpu(cpu, &cpus_with_pcps)
- drain_pages(cpu);
-#endif
}
#ifdef CONFIG_HIBERNATION
@@ -1365,7 +1339,7 @@ void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn;
unsigned long flags;
- int order, t;
+ unsigned int order, t;
struct list_head *curr;
if (zone_is_empty(zone))
@@ -1397,21 +1371,22 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
- * cold == 1 ? free a cold page : free a hot page
+ * cold == true ? free a cold page : free a hot page
*/
-void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pages_prepare(page, 0))
return;
- migratetype = get_pageblock_migratetype(page);
+ migratetype = get_pfnblock_migratetype(page, pfn);
set_freepage_migratetype(page, migratetype);
- local_lock_irqsave(pa_lock, flags);
+ local_irq_save(flags);
__count_vm_event(PGFREE);
/*
@@ -1423,37 +1398,32 @@ void free_hot_cold_page(struct page *page, int cold)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (cold)
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
- else
+ if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
+ else
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = ACCESS_ONCE(pcp->batch);
- LIST_HEAD(dst);
-
- isolate_pcp_pages(batch, pcp, &dst);
+ free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
- local_unlock_irqrestore(pa_lock, flags);
- free_pcppages_bulk(zone, batch, &dst);
- return;
}
out:
- local_unlock_irqrestore(pa_lock, flags);
+ local_irq_restore(flags);
}
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+void free_hot_cold_page_list(struct list_head *list, bool cold)
{
struct page *page, *next;
@@ -1565,19 +1535,19 @@ int split_free_page(struct page *page)
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
- local_lock_irqsave(pa_lock, flags);
+ local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
if (list_empty(list)) {
@@ -1609,27 +1579,23 @@ again:
*/
WARN_ON_ONCE(order > 1);
}
- local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
+ spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
- if (!page) {
- spin_unlock(&zone->lock);
+ spin_unlock(&zone->lock);
+ if (!page)
goto failed;
- }
__mod_zone_freepage_state(zone, -(1 << order),
- get_pageblock_migratetype(page));
- spin_unlock(&zone->lock);
+ get_freepage_migratetype(page));
}
- /*
- * NOTE: GFP_THISNODE allocations do not partake in the kswapd
- * aging protocol, so they can't be fair.
- */
- if (!gfp_thisnode_allocation(gfp_flags))
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+ !zone_is_fair_depleted(zone))
+ zone_set_flag(zone, ZONE_FAIR_DEPLETED);
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
- local_unlock_irqrestore(pa_lock, flags);
+ local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
@@ -1637,7 +1603,7 @@ again:
return page;
failed:
- local_unlock_irqrestore(pa_lock, flags);
+ local_irq_restore(flags);
return NULL;
}
@@ -1722,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags, long free_pages)
+static bool __zone_watermark_ok(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags,
+ long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
long free_cma = 0;
@@ -1742,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
- if (free_pages - free_cma <= min + lowmem_reserve)
+ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1757,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
@@ -1907,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
- for_each_online_node(i)
+ for_each_node_state(i, N_MEMORY)
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);
else
@@ -1950,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid)
}
#endif /* CONFIG_NUMA */
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+ struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+ do {
+ mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+ zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+ } while (zone++ != preferred_zone);
+}
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -1957,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
+ struct zone *preferred_zone, int classzone_idx, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
- int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE);
+ int nr_fair_skipped = 0;
+ bool zonelist_rescan;
- classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
+ zonelist_rescan = false;
+
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1980,34 +1962,23 @@ zonelist_scan:
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
- goto try_this_zone;
/*
* Distribute pages in proportion to the individual
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
- *
- * Try to stay in local zones in the fastpath. If
- * that fails, the slowpath is entered, which will do
- * another pass starting with the local zones, but
- * ultimately fall back to remote zones that do not
- * partake in the fairness round-robin cycle of this
- * zonelist.
- *
- * NOTE: GFP_THISNODE allocations do not partake in
- * the kswapd aging protocol, so they can't be fair.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- !gfp_thisnode_allocation(gfp_mask)) {
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
+ if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
+ break;
+ if (zone_is_fair_depleted(zone)) {
+ nr_fair_skipped++;
continue;
+ }
}
/*
* When allocating a page cache page for writing, we
@@ -2035,15 +2006,19 @@ zonelist_scan:
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
- goto this_zone_full;
+ if (consider_zone_dirty && !zone_dirty_ok(zone))
+ continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags)) {
int ret;
+ /* Checked here to keep the fast path fast */
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ goto try_this_zone;
+
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
@@ -2105,17 +2080,11 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA))
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
zlc_mark_zone_full(zonelist, z);
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
-
- if (page)
+ if (page) {
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
@@ -2124,8 +2093,37 @@ this_zone_full:
* for !PFMEMALLOC purposes.
*/
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+ return page;
+ }
- return page;
+ /*
+ * The first pass makes sure allocations are spread fairly within the
+ * local node. However, the local node might have free pages left
+ * after the fairness batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without fairness, and
+ * include remote zones now, before entering the slowpath and waking
+ * kswapd: prefer spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ alloc_flags &= ~ALLOC_FAIR;
+ if (nr_fair_skipped) {
+ zonelist_rescan = true;
+ reset_alloc_batches(preferred_zone);
+ }
+ if (nr_online_nodes > 1)
+ zonelist_rescan = true;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ zonelist_rescan = true;
+ }
+
+ if (zonelist_rescan)
+ goto zonelist_scan;
+
+ return NULL;
}
/*
@@ -2241,7 +2239,7 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
@@ -2252,6 +2250,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/*
+ * PM-freezer should be notified that there might be an OOM killer on
+ * its way to kill and wake somebody up. This is too early and we might
+ * end up not killing anything but false positives are acceptable.
+ * See freeze_processes.
+ */
+ note_oom_kill();
+
+ /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
@@ -2259,7 +2265,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto out;
@@ -2294,7 +2300,7 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
+ int classzone_idx, int migratetype, enum migrate_mode mode,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
@@ -2308,7 +2314,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, sync_migration,
+ nodemask, mode,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
@@ -2316,19 +2322,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct page *page;
/* Page migration frees to the PCP lists but we want merging */
- drain_pages(get_cpu_light());
- put_cpu_light();
+ drain_pages(get_cpu());
+ put_cpu();
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- preferred_zone->compact_considered = 0;
- preferred_zone->compact_defer_shift = 0;
- if (order >= preferred_zone->compact_order_failed)
- preferred_zone->compact_order_failed = order + 1;
+ compaction_defer_reset(preferred_zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2344,7 +2347,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
- if (sync_migration)
+ if (mode != MIGRATE_ASYNC)
defer_compaction(preferred_zone, order);
cond_resched();
@@ -2357,9 +2360,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
- bool *contended_compaction, bool *deferred_compaction,
- unsigned long *did_some_progress)
+ int classzone_idx, int migratetype,
+ enum migrate_mode mode, bool *contended_compaction,
+ bool *deferred_compaction, unsigned long *did_some_progress)
{
return NULL;
}
@@ -2398,7 +2401,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
@@ -2416,7 +2419,8 @@ retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx,
+ migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2439,14 +2443,14 @@ static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2455,7 +2459,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+static void wake_all_kswapds(unsigned int order,
struct zonelist *zonelist,
enum zone_type high_zoneidx,
struct zone *preferred_zone)
@@ -2463,29 +2467,15 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
- /*
- * Only reset the batches of zones that were actually
- * considered in the fast path, we don't want to
- * thrash fairness information for zones that are not
- * actually part of this zonelist's round-robin cycle.
- */
- if (!zone_local(preferred_zone, zone))
- continue;
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
- }
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2494,20 +2484,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2539,14 +2529,14 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- bool sync_migration = false;
+ enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
bool contended_compaction = false;
@@ -2569,12 +2559,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (gfp_thisnode_allocation(gfp_mask))
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- prepare_slowpath(gfp_mask, order, zonelist,
- high_zoneidx, preferred_zone);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2587,15 +2578,19 @@ restart:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
- first_zones_zonelist(zonelist, high_zoneidx, NULL,
- &preferred_zone);
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ struct zoneref *preferred_zoneref;
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ NULL,
+ &preferred_zone);
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ }
rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2610,7 +2605,7 @@ rebalance:
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
@@ -2632,17 +2627,16 @@ rebalance:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
- sync_migration = true;
+ migration_mode = MIGRATE_SYNC_LIGHT;
/*
* If compaction is deferred for high-order allocations, it is because
@@ -2659,7 +2653,8 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ classzone_idx, migratetype,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -2678,7 +2673,7 @@ rebalance:
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
- migratetype);
+ classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2717,12 +2712,11 @@ rebalance:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
@@ -2748,11 +2742,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
+ struct zoneref *preferred_zoneref;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
struct mem_cgroup *memcg = NULL;
+ int classzone_idx;
gfp_mask &= gfp_allowed_mask;
@@ -2779,14 +2775,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
return NULL;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx,
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -2795,7 +2792,7 @@ retry_cpuset:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
@@ -2805,7 +2802,7 @@ retry_cpuset:
gfp_mask = memalloc_noio_flags(gfp_mask);
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2817,7 +2814,7 @@ out:
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
@@ -2856,7 +2853,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
@@ -3085,9 +3082,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
goto out;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
out:
return ret;
}
@@ -3240,12 +3237,12 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
- zone->pages_scanned,
+ K(zone_page_state(zone, NR_PAGES_SCANNED)),
(!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
+ printk(" %ld", zone->lowmem_reserve[i]);
printk("\n");
}
@@ -3985,6 +3982,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
+ int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -4006,6 +4004,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* When memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -4043,6 +4047,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
}
/*
@@ -4122,7 +4132,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
static void __meminit zone_init_free_lists(struct zone *zone)
{
- int order, t;
+ unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
@@ -4134,7 +4144,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4250,8 +4260,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -4945,7 +4955,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
+ if (node_state(nid, N_MEMORY))
+ init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -5519,7 +5530,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
void __init page_alloc_init(void)
{
hotcpu_notifier(page_alloc_cpu_notify, 0);
- local_irq_lock_init(pa_lock);
}
/*
@@ -5535,7 +5545,7 @@ static void calculate_totalreserve_pages(void)
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
+ long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5650,9 +5660,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -5777,7 +5786,12 @@ module_init(init_per_zone_wmark_min)
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
@@ -5845,23 +5859,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
- return ret;
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
- mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
+out:
mutex_unlock(&pcp_batch_high_lock);
- return 0;
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;
@@ -6004,53 +6033,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
* @end_bitidx: The last bit of interest
* returns pageblock_bits flags
*/
-unsigned long get_pageblock_flags_group(struct page *page,
- int start_bitidx, int end_bitidx)
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long flags = 0;
- unsigned long value = 1;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (test_bit(bitidx + start_bitidx, bitmap))
- flags |= value;
-
- return flags;
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
/**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @start_bitidx: The first bit of interest
* @end_bitidx: The last bit of interest
* @flags: The flags to set
*/
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
- int start_bitidx, int end_bitidx)
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long value = 1;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
VM_BUG_ON(!zone_spans_pfn(zone, pfn));
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (flags & value)
- __set_bit(bitidx + start_bitidx, bitmap);
- else
- __clear_bit(bitidx + start_bitidx, bitmap);
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = ACCESS_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
}
/*
@@ -6210,7 +6250,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- 0, MIGRATE_SYNC, MR_CMA);
+ NULL, 0, cc->mode, MR_CMA);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
@@ -6249,7 +6289,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .sync = true,
+ .mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -6383,7 +6423,7 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
- local_lock_irqsave(pa_lock, flags);
+ local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -6392,7 +6432,7 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
- local_unlock_irqrestore(pa_lock, flags);
+ local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -6404,7 +6444,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- int order, i;
+ unsigned int order, i;
unsigned long pfn;
unsigned long flags;
/* find the first valid pfn */
@@ -6456,7 +6496,7 @@ bool is_free_buddy_page(struct page *page)
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
- int order;
+ unsigned int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 98caeee..e007236 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -13,14 +13,6 @@
static unsigned long total_usage;
-static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages)
-{
-#ifdef CONFIG_PREEMPT_RT_BASE
- for (; nr_pages; nr_pages--, pc++)
- spin_lock_init(&pc->pcg_lock);
-#endif
-}
-
#if !defined(CONFIG_SPARSEMEM)
@@ -68,7 +60,6 @@ static int __init alloc_node_page_cgroup(int nid)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
- page_cgroup_lock_init(base, nr_pages);
return 0;
}
@@ -159,8 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
return -ENOMEM;
}
- page_cgroup_lock_init(base, PAGES_PER_SECTION);
-
/*
* The passed "pfn" may not be aligned to SECTION. For the calculation
* we need to apply a mask.
@@ -181,6 +170,7 @@ static void free_page_cgroup(void *addr)
sizeof(struct page_cgroup) * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71..5110816 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -108,7 +108,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
+ unsigned int cpu, tcpu;
int i;
for_each_possible_cpu(cpu) {
@@ -116,14 +116,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
+ if (!*pagep)
+ goto err;
}
}
return 0;
+
+err:
+ while (--i >= page_start)
+ __free_page(pages[pcpu_page_idx(cpu, i)]);
+
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ for (i = page_start; i < page_end; i++)
+ __free_page(pages[pcpu_page_idx(tcpu, i)]);
+ }
+ return -ENOMEM;
}
/**
@@ -263,6 +272,7 @@ err:
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
+ pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 8c8e08f..25e2ea5 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -612,7 +612,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
return NULL;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed041..0f35e98 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
#include <linux/syscalls.h>
#include <linux/file.h>
+#include "internal.h"
+
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -149,8 +149,7 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
- if (page)
+ if (page && !radix_tree_exceptional_entry(page))
continue;
page = page_cache_alloc_readahead(mapping);
@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
return ret;
}
+#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
*/
unsigned long max_sane_readahead(unsigned long nr)
{
- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
-}
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- int actual;
-
- actual = __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-
- return actual;
+ return min(nr, MAX_READAHEAD);
}
/*
@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t head;
rcu_read_lock();
- head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+ head = page_cache_prev_hole(mapping, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
+ pgoff_t prev_offset;
/*
* start of file
@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+ start = page_cache_next_hole(mapping, offset + 1, max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
+ * trivial case: (offset - prev_offset) == 1
+ * unaligned reads: (offset - prev_offset) == 0
*/
- if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ if (offset - prev_offset <= 1UL)
goto initial_readahead;
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index b9d2222..440c71c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
+ might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
@@ -273,6 +274,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
+ int error;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
@@ -282,8 +284,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- if (anon_vma_clone(vma, pvma))
- return -ENOMEM;
+ error = anon_vma_clone(vma, pvma);
+ if (error)
+ return error;
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
@@ -426,8 +429,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* above cannot corrupt).
*/
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
+ return NULL;
}
out:
rcu_read_unlock();
@@ -477,9 +481,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto out;
+ return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
@@ -567,6 +571,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
+ pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -577,7 +582,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ /*
+ * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = ACCESS_ONCE(*pmd);
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
pmd = NULL;
out:
return pmd;
@@ -613,9 +624,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
if (!pmd)
return NULL;
- if (pmd_trans_huge(*pmd))
- return NULL;
-
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
if (!sync && !pte_present(*pte)) {
@@ -1392,9 +1400,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
@@ -1671,10 +1689,9 @@ void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
+ anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
-
- anon_vma_free(anon_vma);
}
#ifdef CONFIG_MIGRATION
diff --git a/mm/shmem.c b/mm/shmem.c
index 8297623..e9502a67 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
+ wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -242,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
void **pslot;
- void *item = NULL;
+ void *item;
VM_BUG_ON(!expected);
+ VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (pslot)
- item = radix_tree_deref_slot_protected(pslot,
- &mapping->tree_lock);
+ if (!pslot)
+ return -ENOENT;
+ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- if (replacement)
- radix_tree_replace_slot(pslot, replacement);
- else
- radix_tree_delete(&mapping->page_tree, index);
+ radix_tree_replace_slot(pslot, replacement);
return 0;
}
@@ -331,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
}
/*
- * Like find_get_pages, but collecting swap entries as well as pages.
- */
-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
- pgoff_t start, unsigned int nr_pages,
- struct page **pages, pgoff_t *indices)
-{
- void **slot;
- unsigned int ret = 0;
- struct radix_tree_iter iter;
-
- if (!nr_pages)
- return 0;
-
- rcu_read_lock();
-restart:
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
- /*
- * Otherwise, we must be storing a swap entry
- * here as an exceptional entry: so return it
- * without attempting to raise page count.
- */
- goto export;
- }
- if (!page_cache_get_speculative(page))
- goto repeat;
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- page_cache_release(page);
- goto repeat;
- }
-export:
- indices[ret] = iter.index;
- pages[ret] = page;
- if (++ret == nr_pages)
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Remove swap entry from radix tree, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
- int error;
+ void *old;
spin_lock_irq(&mapping->tree_lock);
- error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+ old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
spin_unlock_irq(&mapping->tree_lock);
- if (!error)
- free_swap_and_cache(radix_to_swp_entry(radswap));
- return error;
-}
-
-/*
- * Pagevec may contain swap entries, so shuffle up pages before releasing.
- */
-static void shmem_deswap_pagevec(struct pagevec *pvec)
-{
- int i, j;
-
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
- pvec->pages[j++] = page;
- }
- pvec->nr = j;
+ if (old != radswap)
+ return -ENOENT;
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return 0;
}
/*
@@ -429,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
*/
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
if (!pvec.nr)
break;
index = indices[pvec.nr - 1] + 1;
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
check_move_unevictable_pages(pvec.pages, pvec.nr);
pagevec_release(&pvec);
cond_resched();
@@ -466,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pagevec_init(&pvec, 0);
index = start;
while (index < end) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
if (!pvec.nr)
break;
mem_cgroup_uncharge_start();
@@ -497,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -533,22 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
return;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+
+ pvec.nr = find_get_entries(mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.pages, indices);
if (!pvec.nr) {
- if (index == start || unfalloc)
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
break;
+ /* But if truncating, restart to make sure all gone */
index = start;
continue;
}
- if ((index == start || unfalloc) && indices[0] >= end) {
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- break;
- }
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -560,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (radix_tree_exceptional_entry(page)) {
if (unfalloc)
continue;
- nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ if (shmem_free_swap(mapping, index, page)) {
+ /* Swap was replaced by page: retry */
+ index--;
+ break;
+ }
+ nr_swaps_freed++;
continue;
}
@@ -570,11 +507,16 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (page->mapping == mapping) {
VM_BUG_ON(PageWriteback(page));
truncate_inode_page(mapping, page);
+ } else {
+ /* Page was replaced by swap: retry */
+ unlock_page(page);
+ index--;
+ break;
}
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -826,6 +768,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private;
if (shmem_falloc &&
+ !shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped++;
@@ -1082,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
return -EFBIG;
repeat:
swap.val = 0;
- page = find_lock_page(mapping, index);
+ page = find_lock_entry(mapping, index);
if (radix_tree_exceptional_entry(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
@@ -1094,6 +1037,9 @@ repeat:
goto failed;
}
+ if (page && sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
/* fallocated page? */
if (page && !PageUptodate(page)) {
if (sgp != SGP_READ)
@@ -1175,6 +1121,9 @@ repeat:
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
delete_from_swap_cache(page);
set_page_dirty(page);
swap_free(swap);
@@ -1199,8 +1148,11 @@ repeat:
goto decused;
}
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
__set_page_locked(page);
+ if (sgp == SGP_WRITE)
+ init_page_accessed(page);
+
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
@@ -1300,6 +1252,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
int ret = VM_FAULT_LOCKED;
+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_mutex in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT(shmem_fault_wait);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+ !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* It's polite to up mmap_sem if we can */
+ up_read(&vma->vm_mm->mmap_sem);
+ ret = VM_FAULT_RETRY;
+ }
+
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ spin_unlock(&inode->i_lock);
+ return ret;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1419,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->backing_dev_info == &shmem_backing_dev_info;
+}
+
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -1731,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pagevec_init(&pvec, 0);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr = find_get_entries(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
if (whence == SEEK_DATA)
@@ -1758,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
break;
}
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
pvec.nr = PAGEVEC_SIZE;
cond_resched();
@@ -1819,12 +1834,25 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+ shmem_falloc.waitq = &shmem_falloc_waitq;
+ shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
shmem_truncate_range(inode, offset, offset + len - 1);
/* No need to unmap again: hole-punching leaves COWed pages */
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&shmem_falloc_waitq);
+ spin_unlock(&inode->i_lock);
error = 0;
goto out;
}
@@ -1842,6 +1870,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ shmem_falloc.waitq = NULL;
shmem_falloc.start = start;
shmem_falloc.next = start;
shmem_falloc.nr_falloced = 0;
@@ -2077,8 +2106,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
- if (they_are_dirs)
+ if (they_are_dirs) {
+ drop_nlink(new_dentry->d_inode);
drop_nlink(old_dir);
+ }
} else if (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
diff --git a/mm/slab.c b/mm/slab.c
index 2580db0..c180fbb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
{
if (unlikely(pfmemalloc_active)) {
/* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
+ struct slab *slabp = virt_to_slab(objp);
+ struct page *page = virt_to_head_page(slabp->s_mem);
if (PageSlabPfmemalloc(page))
set_obj_pfmemalloc(&objp);
}
@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
__SetPageSlab(page + i);
if (page->pfmemalloc)
- SetPageSlabPfmemalloc(page + i);
+ SetPageSlabPfmemalloc(page);
}
memcg_bind_pages(cachep, cachep->gfporder);
@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
else
sub_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_freed);
+
+ __ClearPageSlabPfmemalloc(page);
while (i--) {
BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page++;
}
@@ -2220,7 +2222,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, slab_size, ralign;
+ size_t left_over, slab_size;
+ size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
size_t size = cachep->size;
@@ -2253,14 +2256,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(BYTES_PER_WORD - 1);
}
- /*
- * Redzoning and user store require word alignment or possibly larger.
- * Note this will be overridden by architecture or caller mandated
- * alignment if either is greater than BYTES_PER_WORD.
- */
- if (flags & SLAB_STORE_USER)
- ralign = BYTES_PER_WORD;
-
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
@@ -3220,7 +3215,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(slab_node(), flags);
retry:
@@ -3276,7 +3271,7 @@ retry:
}
}
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return obj;
}
diff --git a/mm/slab.h b/mm/slab.h
index 8ffb287..a535033 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -247,11 +247,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
* The slab lists for all objects.
*/
struct kmem_cache_node {
-#ifdef CONFIG_SLUB
- raw_spinlock_t list_lock;
-#else
spinlock_t list_lock;
-#endif
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e2e98af..97e5f5e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -56,7 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
+#if !defined(CONFIG_SLUB)
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there
diff --git a/mm/slub.c b/mm/slub.c
index a164648..a88d94c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1087,7 +1087,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- raw_spin_lock_irqsave(&n->list_lock, *flags);
+ spin_lock_irqsave(&n->list_lock, *flags);
slab_lock(page);
if (!check_slab(s, page))
@@ -1135,7 +1135,7 @@ out:
fail:
slab_unlock(page);
- raw_spin_unlock_irqrestore(&n->list_lock, *flags);
+ spin_unlock_irqrestore(&n->list_lock, *flags);
slab_fix(s, "Object at 0x%p not freed", object);
return NULL;
}
@@ -1270,12 +1270,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
#endif /* CONFIG_SLUB_DEBUG */
-struct slub_free_list {
- raw_spinlock_t lock;
- struct list_head list;
-};
-static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
-
/*
* Slab allocation and freeing
*/
@@ -1297,15 +1291,10 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
- bool enableirqs;
flags &= gfp_allowed_mask;
- enableirqs = (flags & __GFP_WAIT) != 0;
-#ifdef CONFIG_PREEMPT_RT_FULL
- enableirqs |= system_state == SYSTEM_RUNNING;
-#endif
- if (enableirqs)
+ if (flags & __GFP_WAIT)
local_irq_enable();
flags |= s->allocflags;
@@ -1345,7 +1334,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kmemcheck_mark_unallocated_pages(page, pages);
}
- if (enableirqs)
+ if (flags & __GFP_WAIT)
local_irq_disable();
if (!page)
return NULL;
@@ -1363,10 +1352,8 @@ static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
setup_object_debug(s, page, object);
-#ifndef CONFIG_PREEMPT_RT_FULL
if (unlikely(s->ctor))
s->ctor(object);
-#endif
}
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1444,16 +1431,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_memcg_kmem_pages(page, order);
}
-static void free_delayed(struct list_head *h)
-{
- while(!list_empty(h)) {
- struct page *page = list_first_entry(h, struct page, lru);
-
- list_del(&page->lru);
- __free_slab(page->slab_cache, page);
- }
-}
-
#define need_reserve_slab_rcu \
(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
@@ -1488,12 +1465,6 @@ static void free_slab(struct kmem_cache *s, struct page *page)
}
call_rcu(head, rcu_free_slab);
- } else if (irqs_disabled()) {
- struct slub_free_list *f = &__get_cpu_var(slub_free_list);
-
- raw_spin_lock(&f->lock);
- list_add(&page->lru, &f->list);
- raw_spin_unlock(&f->lock);
} else
__free_slab(s, page);
}
@@ -1598,7 +1569,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!n || !n->nr_partial)
return NULL;
- raw_spin_lock(&n->list_lock);
+ spin_lock(&n->list_lock);
list_for_each_entry_safe(page, page2, &n->partial, lru) {
void *t;
@@ -1623,7 +1594,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
break;
}
- raw_spin_unlock(&n->list_lock);
+ spin_unlock(&n->list_lock);
return object;
}
@@ -1664,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
return NULL;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1676,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
object = get_partial_node(s, n, c, flags);
if (object) {
/*
- * Return the object even if
- * put_mems_allowed indicated that
- * the cpuset mems_allowed was
- * updated in parallel. It's a
- * harmless race between the alloc
- * and the cpuset update.
+ * Don't check read_mems_allowed_retry()
+ * here - if mems_allowed was updated in
+ * parallel, that was a harmless race
+ * between allocation and the cpuset
+ * update
*/
- put_mems_allowed(cpuset_mems_cookie);
return object;
}
}
}
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif
return NULL;
}
@@ -1866,7 +1835,7 @@ redo:
* that acquire_slab() will see a slab page that
* is frozen
*/
- raw_spin_lock(&n->list_lock);
+ spin_lock(&n->list_lock);
}
} else {
m = M_FULL;
@@ -1877,7 +1846,7 @@ redo:
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- raw_spin_lock(&n->list_lock);
+ spin_lock(&n->list_lock);
}
}
@@ -1912,7 +1881,7 @@ redo:
goto redo;
if (lock)
- raw_spin_unlock(&n->list_lock);
+ spin_unlock(&n->list_lock);
if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
@@ -1944,10 +1913,10 @@ static void unfreeze_partials(struct kmem_cache *s,
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- raw_spin_unlock(&n->list_lock);
+ spin_unlock(&n->list_lock);
n = n2;
- raw_spin_lock(&n->list_lock);
+ spin_lock(&n->list_lock);
}
do {
@@ -1976,7 +1945,7 @@ static void unfreeze_partials(struct kmem_cache *s,
}
if (n)
- raw_spin_unlock(&n->list_lock);
+ spin_unlock(&n->list_lock);
while (discard_page) {
page = discard_page;
@@ -2014,21 +1983,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > s->cpu_partial) {
- struct slub_free_list *f;
unsigned long flags;
- LIST_HEAD(tofree);
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
local_irq_save(flags);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
- f = &__get_cpu_var(slub_free_list);
- raw_spin_lock(&f->lock);
- list_splice_init(&f->list, &tofree);
- raw_spin_unlock(&f->lock);
local_irq_restore(flags);
- free_delayed(&tofree);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2092,22 +2054,7 @@ static bool has_cpu_slab(int cpu, void *info)
static void flush_all(struct kmem_cache *s)
{
- LIST_HEAD(tofree);
- int cpu;
-
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
- for_each_online_cpu(cpu) {
- struct slub_free_list *f;
-
- if (!has_cpu_slab(cpu, s))
- continue;
-
- f = &per_cpu(slub_free_list, cpu);
- raw_spin_lock_irq(&f->lock);
- list_splice_init(&f->list, &tofree);
- raw_spin_unlock_irq(&f->lock);
- free_delayed(&tofree);
- }
}
/*
@@ -2135,10 +2082,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
unsigned long x = 0;
struct page *page;
- raw_spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
x += get_count(page);
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
@@ -2281,11 +2228,9 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
- struct slub_free_list *f;
void *freelist;
struct page *page;
unsigned long flags;
- LIST_HEAD(tofree);
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
@@ -2348,13 +2293,7 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
-out:
- f = &__get_cpu_var(slub_free_list);
- raw_spin_lock(&f->lock);
- list_splice_init(&f->list, &tofree);
- raw_spin_unlock(&f->lock);
local_irq_restore(flags);
- free_delayed(&tofree);
return freelist;
new_slab:
@@ -2372,7 +2311,9 @@ new_slab:
if (unlikely(!freelist)) {
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
- goto out;
+
+ local_irq_restore(flags);
+ return NULL;
}
page = c->page;
@@ -2387,7 +2328,8 @@ new_slab:
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
- goto out;
+ local_irq_restore(flags);
+ return freelist;
}
/*
@@ -2472,10 +2414,6 @@ redo:
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->object_size);
-#ifdef CONFIG_PREEMPT_RT_FULL
- if (unlikely(s->ctor) && object)
- s->ctor(object);
-#endif
slab_post_alloc_hook(s, gfpflags, object);
@@ -2563,7 +2501,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
do {
if (unlikely(n)) {
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = page->freelist;
@@ -2595,7 +2533,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
- raw_spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2637,7 +2575,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
@@ -2651,7 +2589,7 @@ slab_empty:
/* Slab must be on the full list */
remove_full(s, page);
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
}
@@ -2853,7 +2791,7 @@ static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- raw_spin_lock_init(&n->list_lock);
+ spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
@@ -3439,7 +3377,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
for (i = 0; i < objects; i++)
INIT_LIST_HEAD(slabs_by_inuse + i);
- raw_spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
/*
* Build lists indexed by the items in use in each slab.
@@ -3460,7 +3398,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
for (i = objects - 1; i > 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
@@ -3636,12 +3574,6 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
- INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
- }
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -3946,7 +3878,7 @@ static int validate_slab_node(struct kmem_cache *s,
struct page *page;
unsigned long flags;
- raw_spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru) {
validate_slab_slab(s, page, map);
@@ -3969,7 +3901,7 @@ static int validate_slab_node(struct kmem_cache *s,
atomic_long_read(&n->nr_slabs));
out:
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
@@ -4159,12 +4091,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (!atomic_long_read(&n->nr_slabs))
continue;
- raw_spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
process_slab(&t, s, page, alloc, map);
list_for_each_entry(page, &n->full, lru)
process_slab(&t, s, page, alloc, map);
- raw_spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
}
for (i = 0; i < t.count; i++) {
diff --git a/mm/swap.c b/mm/swap.c
index 05a6951..16e70ce 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,7 +32,6 @@
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
-#include <linux/locallock.h>
#include "internal.h"
@@ -46,9 +45,6 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
-static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
-static DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
-
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
@@ -72,7 +68,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
}
static void __put_compound_page(struct page *page)
@@ -412,11 +408,11 @@ void rotate_reclaimable_page(struct page *page)
unsigned long flags;
page_cache_get(page);
- local_lock_irqsave(rotate_lock, flags);
+ local_irq_save(flags);
pvec = &__get_cpu_var(lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
pagevec_move_tail(pvec);
- local_unlock_irqrestore(rotate_lock, flags);
+ local_irq_restore(flags);
}
}
@@ -441,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
SetPageActive(page);
lru += LRU_ACTIVE;
add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_activate(page, page_to_pfn(page));
+ trace_mm_lru_activate(page);
__count_vm_event(PGACTIVATE);
update_page_reclaim_stat(lruvec, file, 1);
@@ -467,13 +463,12 @@ static bool need_activate_page_drain(int cpu)
void activate_page(struct page *page)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_locked_var(swapvec_lock,
- activate_page_pvecs);
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
page_cache_get(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_locked_var(swapvec_lock, activate_page_pvecs);
+ put_cpu_var(activate_page_pvecs);
}
}
@@ -499,7 +494,7 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
int i;
/*
@@ -521,7 +516,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}
- put_locked_var(swapvec_lock, lru_add_pvec);
+ put_cpu_var(lru_add_pvec);
}
/*
@@ -554,26 +549,54 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * Used to mark_page_accessed(page) that is not visible yet and when it is
+ * still safe to use non-atomic ops
*/
-void __lru_cache_add(struct page *page)
+void init_page_accessed(struct page *page)
{
- struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
+ if (!PageReferenced(page))
+ __SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
page_cache_get(page);
if (!pagevec_space(pvec))
__pagevec_lru_add(pvec);
pagevec_add(pvec, page);
- put_locked_var(swapvec_lock, lru_add_pvec);
+ put_cpu_var(lru_add_pvec);
+}
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
}
-EXPORT_SYMBOL(__lru_cache_add);
+
+void lru_cache_add_file(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
@@ -690,9 +713,9 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_lock_irqsave(rotate_lock, flags);
+ local_irq_save(flags);
pagevec_move_tail(pvec);
- local_unlock_irqrestore(rotate_lock, flags);
+ local_irq_restore(flags);
}
pvec = &per_cpu(lru_deactivate_pvecs, cpu);
@@ -720,19 +743,18 @@ void deactivate_page(struct page *page)
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_locked_var(swapvec_lock,
- lru_deactivate_pvecs);
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_locked_var(swapvec_lock, lru_deactivate_pvecs);
+ put_cpu_var(lru_deactivate_pvecs);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
- local_unlock_cpu(swapvec_lock);
+ lru_add_drain_cpu(get_cpu());
+ put_cpu();
}
static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -785,7 +807,7 @@ void lru_add_drain_all(void)
* grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
* will free it.
*/
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
{
int i;
LIST_HEAD(pages_to_free);
@@ -826,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold)
}
/* Clear Active bit in case of parallel mark_page_accessed */
- ClearPageActive(page);
+ __ClearPageActive(page);
list_add(&page->lru, &pages_to_free);
}
@@ -908,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
update_page_reclaim_stat(lruvec, file, active);
- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
+ trace_mm_lru_insertion(page, lru);
}
/*
@@ -922,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_lru_add);
/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_entries: The maximum number of entries
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_entries pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages,
+ pgoff_t *indices)
+{
+ pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+}
+
+/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8..4079edf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
return ret;
}
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, todo, 0);
+ release_pages(pagep, todo, false);
pagep += todo;
nr -= todo;
}
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), entry.val);
- if (page)
+ if (page) {
INC_CACHE_INFO(find_success);
+ if (TestClearPageReadahead(page))
+ atomic_inc(&swapin_readahead_hits);
+ }
INC_CACHE_INFO(find_total);
return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int pages, max_pages, last_ra;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << ACCESS_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ prev_offset = offset;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = atomic_read(&last_readahead_pages) / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- unsigned long offset = swp_offset(entry);
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
- unsigned long mask = (1UL << page_cluster) - 1;
+ unsigned long mask;
struct blk_plug plug;
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
+ if (offset != entry_offset)
+ SetPageReadahead(page);
page_cache_release(page);
}
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0ec2eaf..660b9c0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -591,6 +609,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -639,71 +660,65 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- int hp_index;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- hp_index = atomic_xchg(&highest_priority_index, -1);
- /*
- * highest_priority_index records current highest priority swap
- * type which just frees swap entries. If its priority is
- * higher than that of swap_list.next swap type, we use it. It
- * isn't protected by swap_lock, so it can be an invalid value
- * if the corresponding swap type is swapoff. We double check
- * the flags here. It's even possible the swap type is swapoff
- * and swapon again and its priority is changed. In such rare
- * case, low prority swap type might be used, but eventually
- * high priority swap will be used after several rounds of
- * swap.
- */
- if (hp_index != -1 && hp_index != type &&
- swap_info[type]->prio < swap_info[hp_index]->prio &&
- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
- type = hp_index;
- swap_list.next = type;
- }
-
- si = swap_info[type];
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next]->prio)) {
- next = swap_list.head;
- wrapped++;
- }
+ spin_lock(&swap_avail_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
- if (!si->highest_bit) {
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&si->lock);
- continue;
+ goto nextsi;
}
- if (!(si->flags & SWP_WRITEOK)) {
- spin_unlock(&si->lock);
- continue;
- }
-
- swap_list.next = next;
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
- return swp_entry(type, offset);
- spin_lock(&swap_lock);
- next = swap_list.next;
+ return swp_entry(si->type, offset);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
+ */
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
+ spin_unlock(&swap_avail_lock);
+
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -765,27 +780,6 @@ out:
return NULL;
}
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
- int old_hp_index, new_hp_index;
-
- do {
- old_hp_index = atomic_read(&highest_priority_index);
- if (old_hp_index != -1 &&
- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
- break;
- new_hp_index = type;
- } while (atomic_cmpxchg(&highest_priority_index,
- old_hp_index, new_hp_index) != old_hp_index);
-}
-
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
- set_highest_priority_index(p->type);
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- int i, prev;
-
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p->type;
- else
- swap_info[prev]->next = p->type;
+ assert_spin_locked(&swap_lock);
+ /*
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
+ */
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int i, type, prev;
- int err;
+ int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
- p = swap_info[type];
+ plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping)
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0)
- swap_list.head = p->next;
- else
- swap_info[prev]->next = p->next;
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i]->next)
- swap_info[i]->prio = p->prio--;
+ struct swap_info_struct *si = p;
+
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
+ }
least_priority++;
}
+ plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map_set(p, NULL);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(type);
+ frontswap_invalidate_area(p->type);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account informatin */
- swap_cgroup_swapoff(type);
+ swap_cgroup_swapoff(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
- p->next = -1;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index 353b683..827ad8d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,8 +20,25 @@
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
+static void clear_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ radix_tree_delete_item(&mapping->page_tree, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+}
/**
* do_invalidatepage - invalidate part or all of a page
@@ -208,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
unsigned int partial_start; /* inclusive */
unsigned int partial_end; /* exclusive */
struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
@@ -238,17 +256,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_init(&pvec, 0);
index = start;
- while (index < end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index >= end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -259,6 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -307,14 +332,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
+ if (!pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
if (index == start)
break;
index = start;
continue;
}
- if (index == start && pvec.pages[0]->index >= end) {
+ if (index == start && indices[0] >= end) {
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
break;
}
@@ -323,16 +350,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index >= end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
lock_page(page);
WARN_ON(page->index != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -375,6 +408,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index = start;
unsigned long ret;
@@ -390,17 +424,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
*/
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index > end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -414,6 +454,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
deactivate_page(page);
count += ret;
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -481,6 +522,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index;
int i;
@@ -491,17 +533,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
cleancache_invalidate_inode(mapping);
pagevec_init(&pvec, 0);
index = start;
- while (index <= end && pagevec_lookup(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = page->index;
+ index = indices[i];
if (index > end)
break;
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
lock_page(page);
WARN_ON(page->index != index);
if (page->mapping != mapping) {
@@ -539,6 +587,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
ret = ret2;
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -613,12 +662,67 @@ EXPORT_SYMBOL(truncate_pagecache);
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
+ loff_t oldsize = inode->i_size;
+
i_size_write(inode, newsize);
+ if (newsize > oldsize)
+ pagecache_isize_extended(inode, oldsize, newsize);
truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
/**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode: inode for which i_size was extended
+ * @from: original inode size
+ * @to: new inode size
+ *
+ * Handle extension of inode size either caused by extending truncate or by
+ * write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the nearest write access to
+ * the page. This way filesystem can be sure that page_mkwrite() is called on
+ * the page before user writes to the page via mmap after the i_size has been
+ * changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_mutex - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+ int bsize = 1 << inode->i_blkbits;
+ loff_t rounded_from;
+ struct page *page;
+ pgoff_t index;
+
+ WARN_ON(to > inode->i_size);
+
+ if (from >= to || bsize == PAGE_CACHE_SIZE)
+ return;
+ /* Page straddling @from will not have any hole block created? */
+ rounded_from = round_up(from, bsize);
+ if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+ return;
+
+ index = from >> PAGE_CACHE_SHIFT;
+ page = find_lock_page(inode->i_mapping, index);
+ /* Page not cached? Nothing to do */
+ if (!page)
+ return;
+ /*
+ * See clear_page_dirty_for_io() for details why set_page_dirty()
+ * is needed.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
* @lstart: offset of beginning of hole
diff --git a/mm/util.c b/mm/util.c
index 96da2d7..de943ec 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -272,17 +272,14 @@ pid_t vm_is_stack(struct task_struct *task,
if (in_group) {
struct task_struct *t;
- rcu_read_lock();
- if (!pid_alive(task))
- goto done;
- t = task;
- do {
+ rcu_read_lock();
+ for_each_thread(task, t) {
if (vm_is_stack_for_task(t, vma)) {
ret = t->pid;
goto done;
}
- } while_each_thread(task, t);
+ }
done:
rcu_read_unlock();
}
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 0000000..1037a3ba
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush vma caches for threads that share a given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+ struct task_struct *g, *p;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * Only flush the vmacache pointers as the
+ * mm seqnum is already set and curr's will
+ * be set upon invalidation when the next
+ * lookup is done.
+ */
+ if (mm == p->mm)
+ vmacache_flush(p);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via use_mm().
+ * That kernel thread's vmacache is not applicable to this mm.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+ if (vmacache_valid_mm(newvma->vm_mm))
+ current->vmacache[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ /*
+ * First attempt will always be invalid, initialize
+ * the new cache for this task here.
+ */
+ curr->vmacache_seqnum = mm->vmacache_seqnum;
+ vmacache_flush(curr);
+ return false;
+ }
+ return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (!vma)
+ continue;
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+ if (vma->vm_start <= addr && vma->vm_end > addr)
+ return vma;
+ }
+
+ return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end)
+ return vma;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d64289d..060dc36 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+
retry:
spin_lock(&vmap_area_lock);
/*
@@ -790,7 +796,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
- int node, err, cpu;
+ int node, err;
node = numa_node_id();
@@ -828,12 +834,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
BUG_ON(err);
radix_tree_preload_end();
- cpu = get_cpu_light();
- vbq = &__get_cpu_var(vmap_block_queue);
+ vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
list_add_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_light();
+ put_cpu_var(vmap_block_queue);
return vb;
}
@@ -901,7 +906,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
- int cpu = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -917,8 +921,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
again:
rcu_read_lock();
- cpu = get_cpu_light();
- vbq = &__get_cpu_var(vmap_block_queue);
+ vbq = &get_cpu_var(vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
int i;
@@ -942,7 +945,7 @@ next:
spin_unlock(&vb->lock);
}
- put_cpu_light();
+ put_cpu_var(vmap_block_queue);
rcu_read_unlock();
if (!addr) {
@@ -1649,11 +1652,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
clear_vm_uninitialized_flag(area);
/*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
+ * A ref_count = 2 is needed because vm_struct allocated in
+ * __get_vm_area_node() contains a reference to the virtual address of
+ * the vmalloc'ed block.
*/
- kmemleak_alloc(addr, real_size, 3, gfp_mask);
+ kmemleak_alloc(addr, real_size, 2, gfp_mask);
return addr;
@@ -2682,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
prev_end = VMALLOC_START;
- spin_lock(&vmap_area_lock);
+ rcu_read_lock();
if (list_empty(&vmap_area_list)) {
vmi->largest_chunk = VMALLOC_TOTAL;
goto out;
}
- list_for_each_entry(va, &vmap_area_list, list) {
+ list_for_each_entry_rcu(va, &vmap_area_list, list) {
unsigned long addr = va->va_start;
/*
@@ -2716,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
vmi->largest_chunk = VMALLOC_END - prev_end;
out:
- spin_unlock(&vmap_area_lock);
+ rcu_read_unlock();
}
#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index e0f6283..c98b14e 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -164,6 +164,7 @@ static void vmpressure_work_fn(struct work_struct *work)
unsigned long scanned;
unsigned long reclaimed;
+ spin_lock(&vmpr->sr_lock);
/*
* Several contexts might be calling vmpressure(), so it is
* possible that the work was rescheduled again before the old
@@ -172,11 +173,12 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- if (!vmpr->scanned)
+ scanned = vmpr->scanned;
+ if (!scanned) {
+ spin_unlock(&vmpr->sr_lock);
return;
+ }
- spin_lock(&vmpr->sr_lock);
- scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
vmpr->scanned = 0;
vmpr->reclaimed = 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 05e6095..ee8363f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
bool zone_reclaimable(struct zone *zone)
{
- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+ return zone_page_state(zone, NR_PAGES_SCANNED) <
+ zone_reclaimable_pages(zone) * 6;
}
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
- long max_pass;
+ long freeable;
long nr;
long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- if (max_pass == 0)
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0)
return 0;
/*
@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
+ delta *= freeable;
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
- total_scan = max_pass;
+ total_scan = freeable;
}
/*
@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
* shrinkers to return -1 all the time. This results in a large
* nr being built up so when a shrink that can do some work
* comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
+ * freeable. This is bad for sustaining a working set in
* memory.
*
* Hence only allow the shrinker to scan the entire cache when
* a large delta change is calculated directly.
*/
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
+ freeable, delta, total_scan);
- while (total_scan >= batch_size) {
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
- shrinkctl->nr_to_scan = batch_size;
+ shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
cond_resched();
}
@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (!node_online(shrinkctl->nid))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
- (shrinkctl->nid != 0))
- break;
-
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
}
}
@@ -1089,7 +1108,7 @@ keep:
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
- free_hot_cold_page_list(&free_pages, 1);
+ free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ free_hot_cold_page_list(&page_list, true);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1522,19 +1541,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
* the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context. It will forcibly stall in the
- * next check.
+ * pages from reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
/*
- * In addition, if kswapd scans pages marked marked for
- * immediate reclaim and under writeback (nr_immediate), it
- * implies that pages are cycling through the LRU faster than
+ * If kswapd scans pages marked marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_unqueued_dirty == nr_taken || nr_immediate)
+ if (nr_immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
@@ -1642,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -1708,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ free_hot_cold_page_list(&l_hold, true);
}
#ifdef CONFIG_SWAP
@@ -1830,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file, free;
+ unsigned long anon, file;
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
@@ -1878,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
/*
* If it's foreseeable that reclaiming the file cache won't be
* enough to get the zone back into a desirable shape, we have
@@ -1890,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* thrashing - remaining file pages alone.
*/
if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
- if (unlikely(file + free <= high_wmark_pages(zone))) {
+ unsigned long zonefile;
+ unsigned long zonefree;
+
+ zonefree = zone_page_state(zone, NR_FREE_PAGES);
+ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
scan_balance = SCAN_ANON;
goto out;
}
@@ -1926,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
@@ -2001,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
- bool scan_adjusted = false;
+ bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -2028,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
continue;
/*
- * For global direct reclaim, reclaim only the number of pages
- * requested. Less care is taken to scan proportionally as it
- * is more important to minimise direct reclaim stall latency
- * than it is to properly age the LRU lists.
- */
- if (global_reclaim(sc) && !current_is_kswapd())
- break;
-
- /*
* For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs shrink
+ * requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
@@ -2046,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
@@ -2407,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long lru_pages = 0;
nodes_clear(shrink->nodes_to_scan);
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
@@ -2484,10 +2523,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2512,9 +2558,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2533,10 +2579,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -2798,18 +2868,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
/*
- * There is a potential race between when kswapd checks its watermarks
- * and a process gets throttled. There is also a potential race if
- * processes get throttled, kswapd wakes, a large process exits therby
- * balancing the zones that causes kswapd to miss a wakeup. If kswapd
- * is going to sleep, no process should be sleeping on pfmemalloc_wait
- * so wake them now if necessary. If necessary, processes will wake
- * kswapd and get throttled again
+ * The throttled processes are normally woken up in balance_pgdat() as
+ * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * race between when kswapd checks the watermarks and a process gets
+ * throttled. There is also a potential race if processes get
+ * throttled, kswapd wakes, a large process exits thereby balancing the
+ * zones, which causes kswapd to exit balance_pgdat() before reaching
+ * the wake up checks. If kswapd is going to sleep, no process should
+ * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+ * the wake up is premature, processes will wake kswapd and get
+ * throttled again. The difference from wake ups in balance_pgdat() is
+ * that here we are under prepare_to_wait().
*/
- if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
- wake_up(&pgdat->pfmemalloc_wait);
- return false;
- }
+ if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ wake_up_all(&pgdat->pfmemalloc_wait);
return pgdat_balanced(pgdat, order, classzone_idx);
}
@@ -3267,7 +3339,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index efea337..f7ca044 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
continue;
threshold = (*calculate_pressure)(zone);
- for_each_possible_cpu(cpu)
+ for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
}
@@ -217,7 +217,6 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long x;
long t;
- preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -227,7 +226,6 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
x = 0;
}
__this_cpu_write(*p, x);
- preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -260,7 +258,6 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -269,7 +266,6 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
- preempt_enable_rt();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -284,7 +280,6 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -293,7 +288,6 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
- preempt_enable_rt();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -767,6 +761,7 @@ const char * const vmstat_text[] = {
"nr_shmem",
"nr_dirtied",
"nr_written",
+ "nr_pages_scanned",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -857,12 +852,14 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
-#endif
+#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
@@ -1059,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- zone->pages_scanned,
+ zone_page_state(zone, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
@@ -1069,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
seq_printf(m,
- "\n protection: (%lu",
+ "\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
seq_printf(m,
")"
"\n pagesets");