author    Scott Wood <scottwood@freescale.com>  2015-02-13 22:12:06 (GMT)
committer Scott Wood <scottwood@freescale.com>  2015-02-13 22:19:22 (GMT)
commit    6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch)
tree      f558a94f1553814cc122ab8d9e04c0ebad5262a5 /mm/vmscan.c
parent    fcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff)
download  linux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz
Reset to 3.12.37
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 225
1 file changed, 150 insertions, 75 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 05e6095..ee8363f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
bool zone_reclaimable(struct zone *zone)
{
- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+ return zone_page_state(zone, NR_PAGES_SCANNED) <
+ zone_reclaimable_pages(zone) * 6;
}
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
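As a rough stand-alone illustration of the heuristic above (user-space C with made-up counter values; in the patch the numbers come from zone_page_state() and the new NR_PAGES_SCANNED counter):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in values; the kernel reads these from per-zone vmstat counters. */
static unsigned long pages_scanned = 50000;      /* NR_PAGES_SCANNED */
static unsigned long reclaimable_pages = 10000;  /* zone_reclaimable_pages() */

/* Mirrors the check above: the zone counts as reclaimable until we have
 * scanned six times the number of pages we believe can be reclaimed. */
static bool zone_reclaimable_demo(void)
{
        return pages_scanned < reclaimable_pages * 6;
}

int main(void)
{
        printf("zone reclaimable: %s\n", zone_reclaimable_demo() ? "yes" : "no");
        return 0;
}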
@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
- long max_pass;
+ long freeable;
long nr;
long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
- max_pass = shrinker->count_objects(shrinker, shrinkctl);
- if (max_pass == 0)
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0)
return 0;
/*
@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= max_pass;
+ delta *= freeable;
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
- total_scan = max_pass;
+ total_scan = freeable;
}
/*
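To make the delta arithmetic concrete, here is a stand-alone C sketch with illustrative numbers only (128 LRU pages scanned, shrinker->seeks of 2 as with DEFAULT_SEEKS, a cache reporting 10,000 freeable objects against 100,000 LRU pages); none of these values come from the patch:

#include <stdio.h>

int main(void)
{
        unsigned long long nr_pages_scanned = 128;   /* LRU pages scanned this pass */
        unsigned long long seeks = 2;                /* shrinker->seeks (DEFAULT_SEEKS) */
        unsigned long long freeable = 10000;         /* objects the cache reports as freeable */
        unsigned long long lru_pages = 100000;       /* LRU pages over the zones scanned */

        /* Same proportionality as the code above: scan the slab cache in step
         * with how hard the page LRUs are being scanned. */
        unsigned long long delta = (4 * nr_pages_scanned) / seeks;
        delta *= freeable;
        delta /= lru_pages + 1;

        printf("objects to scan this pass: %llu\n", delta); /* prints 25 here */
        return 0;
}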
@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
* shrinkers to return -1 all the time. This results in a large
* nr being built up so when a shrink that can do some work
* comes along it empties the entire cache due to nr >>>
- * max_pass. This is bad for sustaining a working set in
+ * freeable. This is bad for sustaining a working set in
* memory.
*
* Hence only allow the shrinker to scan the entire cache when
* a large delta change is calculated directly.
*/
- if (delta < max_pass / 4)
- total_scan = min(total_scan, max_pass / 2);
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (total_scan > max_pass * 2)
- total_scan = max_pass * 2;
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
- max_pass, delta, total_scan);
+ freeable, delta, total_scan);
- while (total_scan >= batch_size) {
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
- shrinkctl->nr_to_scan = batch_size;
+ shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
cond_resched();
}
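A condensed user-space sketch of the new loop condition, with scan_objects() replaced by a trivial stub and made-up sizes, showing how a cache smaller than one batch still gets scanned once total_scan covers all of it:

#include <stdio.h>

#define SHRINK_BATCH 128

/* Stub standing in for shrinker->scan_objects(); frees whatever it is asked to. */
static unsigned long scan_objects_stub(long nr_to_scan)
{
        return nr_to_scan;
}

int main(void)
{
        long freeable = 50;            /* small cache: fewer objects than one batch */
        long total_scan = 60;          /* what the delta logic asked us to scan */
        long batch_size = SHRINK_BATCH;
        unsigned long freed = 0;

        /* Same condition as the patch: take a pass even below batch_size when
         * total_scan already covers the whole cache ("tight on memory"). */
        while (total_scan >= batch_size || total_scan >= freeable) {
                long nr_to_scan = total_scan < batch_size ? total_scan : batch_size;

                freed += scan_objects_stub(nr_to_scan);
                total_scan -= nr_to_scan;
        }

        printf("freed %lu objects, %ld left to scan\n", freed, total_scan);
        return 0;
}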
@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (!node_online(shrinkctl->nid))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
- (shrinkctl->nid != 0))
- break;
-
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
+
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
}
}
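The restructured dispatch can be sketched outside the kernel like this; node_online_demo, shrink_node_demo() and MAX_NODES are invented stand-ins for the real node mask, shrink_slab_node() and node iteration:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4
#define SHRINKER_NUMA_AWARE 0x1

static bool node_online_demo[MAX_NODES] = { true, true, false, true };

static unsigned long shrink_node_demo(int nid)
{
        printf("shrinking on node %d\n", nid);
        return 10;
}

int main(void)
{
        unsigned int flags = SHRINKER_NUMA_AWARE;  /* set to 0 for the non-NUMA path */
        unsigned long freed = 0;

        if (!(flags & SHRINKER_NUMA_AWARE)) {
                /* Non-NUMA-aware shrinkers see the whole system as "node 0". */
                freed += shrink_node_demo(0);
        } else {
                for (int nid = 0; nid < MAX_NODES; nid++)
                        if (node_online_demo[nid])
                                freed += shrink_node_demo(nid);
        }

        printf("freed %lu\n", freed);
        return 0;
}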
@@ -1089,7 +1108,7 @@ keep:
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
- free_hot_cold_page_list(&free_pages, 1);
+ free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ free_hot_cold_page_list(&page_list, true);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1522,19 +1541,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
* the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context. It will forcibly stall in the
- * next check.
+ * pages from reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
/*
- * In addition, if kswapd scans pages marked marked for
- * immediate reclaim and under writeback (nr_immediate), it
- * implies that pages are cycling through the LRU faster than
+	 * If kswapd scans pages marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_unqueued_dirty == nr_taken || nr_immediate)
+ if (nr_immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
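A tiny stand-alone sketch of the two stall decisions above, using invented counters from one isolation pass; the real code sets a zone flag and calls congestion_wait(), while this demo only prints what would happen:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned long nr_taken = 32;           /* pages pulled off the LRU */
        unsigned long nr_unqueued_dirty = 32;  /* dirty but not yet queued for IO */
        unsigned long nr_immediate = 0;        /* writeback pages tagged for immediate reclaim */
        bool is_kswapd = true;

        if (is_kswapd) {
                /* Every dirty page was unqueued: flushers are behind, so ask
                 * kswapd to start writing from reclaim context. */
                if (nr_unqueued_dirty == nr_taken)
                        puts("set ZONE_TAIL_LRU_DIRTY");

                /* Pages cycle the LRU faster than writeback completes: stall briefly. */
                if (nr_immediate)
                        puts("congestion_wait(BLK_RW_ASYNC, HZ/10)");
        }
        return 0;
}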
@@ -1642,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -1708,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ free_hot_cold_page_list(&l_hold, true);
}
#ifdef CONFIG_SWAP
@@ -1830,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file, free;
+ unsigned long anon, file;
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
@@ -1878,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
/*
* If it's foreseeable that reclaiming the file cache won't be
* enough to get the zone back into a desirable shape, we have
@@ -1890,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* thrashing - remaining file pages alone.
*/
if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
- if (unlikely(file + free <= high_wmark_pages(zone))) {
+ unsigned long zonefile;
+ unsigned long zonefree;
+
+ zonefree = zone_page_state(zone, NR_FREE_PAGES);
+ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
scan_balance = SCAN_ANON;
goto out;
}
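The low-file-memory check can be tried in isolation with made-up page counts; in the kernel the inputs are NR_FREE_PAGES, NR_ACTIVE_FILE, NR_INACTIVE_FILE and high_wmark_pages(zone):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned long zonefree = 2000;   /* NR_FREE_PAGES */
        unsigned long zonefile = 1500;   /* NR_ACTIVE_FILE + NR_INACTIVE_FILE */
        unsigned long high_wmark = 4000; /* high_wmark_pages(zone) */

        /* Same test as the patch: if freeing every file page still cannot reach
         * the high watermark, file reclaim alone cannot help, so scan anon. */
        bool scan_anon_only = (zonefile + zonefree <= high_wmark);

        printf("balance: %s\n", scan_anon_only ? "SCAN_ANON" : "proportional");
        return 0;
}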
@@ -1926,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
@@ -2001,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
- bool scan_adjusted = false;
+ bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -2028,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
continue;
/*
- * For global direct reclaim, reclaim only the number of pages
- * requested. Less care is taken to scan proportionally as it
- * is more important to minimise direct reclaim stall latency
- * than it is to properly age the LRU lists.
- */
- if (global_reclaim(sc) && !current_is_kswapd())
- break;
-
- /*
* For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs shrink
+ * requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
@@ -2046,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
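The proportional adjustment this hunk feeds into can be followed with a simplified user-space sketch; the targets and remaining counts below are invented, and the percentage arithmetic mirrors the kernel's integer math rather than being exact:

#include <stdio.h>

int main(void)
{
        /* Original scan targets from get_scan_count() and what is still left. */
        unsigned long target_anon = 100, target_file = 400;   /* illustrative */
        unsigned long nr_anon = 60, nr_file = 350;             /* remaining to scan */

        /* The smaller LRU (anon here) stops; note how far through it we got. */
        unsigned long percentage = nr_anon * 100 / (target_anon + 1);

        /* Scale the larger LRU's remaining work to the same unscanned fraction,
         * then discount what has already been scanned. */
        unsigned long already_scanned = target_file - nr_file;
        unsigned long new_nr_file = target_file * percentage / 100;
        unsigned long sub = new_nr_file < already_scanned ? new_nr_file : already_scanned;
        new_nr_file -= sub;   /* i.e. nr[lru] -= min(nr[lru], nr_scanned) in the kernel */

        printf("anon LRU stops; file scan reduced from %lu to %lu remaining\n",
               nr_file, new_nr_file);
        return 0;
}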
@@ -2407,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long lru_pages = 0;
nodes_clear(shrink->nodes_to_scan);
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
@@ -2484,10 +2523,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
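With the two new guards in place, the watermark test reduces to the sketch below; the zone counts are invented, with min_wmark standing in for min_wmark_pages() and free for NR_FREE_PAGES:

#include <stdbool.h>
#include <stdio.h>

#define NZONES 2

int main(void)
{
        unsigned long min_wmark[NZONES] = { 128, 4096 };  /* ZONE_DMA..ZONE_NORMAL */
        unsigned long free[NZONES]      = { 100, 1500 };
        unsigned long pfmemalloc_reserve = 0, free_pages = 0;

        for (int i = 0; i < NZONES; i++) {
                pfmemalloc_reserve += min_wmark[i];
                free_pages += free[i];
        }

        /* No reserves at all (e.g. every lowmem zone unpopulated): do not throttle. */
        if (!pfmemalloc_reserve) {
                puts("wmark ok (no reserves)");
                return 0;
        }

        /* Same test as the patch: ok while free pages exceed half the reserves. */
        bool wmark_ok = free_pages > pfmemalloc_reserve / 2;
        printf("wmark_ok = %s\n", wmark_ok ? "true" : "false");
        return 0;
}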
@@ -2512,9 +2558,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2533,10 +2579,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
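The node-selection loop can be modelled in plain C as follows; struct zone_demo, the zonelist contents and the watermark results are all invented, and the enum stands in for the kernel's zone indices:

#include <stdbool.h>
#include <stdio.h>

enum zone_idx { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

struct zone_demo {
        enum zone_idx idx;
        int node;
        bool watermark_ok;
};

int main(void)
{
        /* Hypothetical zonelist: a highmem zone first, then one lowmem zone per node. */
        struct zone_demo zonelist[] = {
                { ZONE_HIGHMEM, 0, false },
                { ZONE_NORMAL,  0, false },
                { ZONE_NORMAL,  1, true  },
        };
        int chosen_node = -1;

        for (unsigned i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
                if (zonelist[i].idx > ZONE_NORMAL)
                        continue;       /* highmem is unusable for GFP_KERNEL buffers */

                /* Throttling is decided on the first usable node only. */
                chosen_node = zonelist[i].node;
                if (zonelist[i].watermark_ok)
                        chosen_node = -1;   /* reserves ok: no throttling */
                break;
        }

        if (chosen_node < 0)
                puts("do not throttle");
        else
                printf("throttle on node %d\n", chosen_node);
        return 0;
}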
@@ -2798,18 +2868,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
/*
- * There is a potential race between when kswapd checks its watermarks
- * and a process gets throttled. There is also a potential race if
- * processes get throttled, kswapd wakes, a large process exits therby
- * balancing the zones that causes kswapd to miss a wakeup. If kswapd
- * is going to sleep, no process should be sleeping on pfmemalloc_wait
- * so wake them now if necessary. If necessary, processes will wake
- * kswapd and get throttled again
+ * The throttled processes are normally woken up in balance_pgdat() as
+ * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * race between when kswapd checks the watermarks and a process gets
+ * throttled. There is also a potential race if processes get
+ * throttled, kswapd wakes, a large process exits thereby balancing the
+ * zones, which causes kswapd to exit balance_pgdat() before reaching
+ * the wake up checks. If kswapd is going to sleep, no process should
+ * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+ * the wake up is premature, processes will wake kswapd and get
+ * throttled again. The difference from wake ups in balance_pgdat() is
+ * that here we are under prepare_to_wait().
*/
- if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
- wake_up(&pgdat->pfmemalloc_wait);
- return false;
- }
+ if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ wake_up_all(&pgdat->pfmemalloc_wait);
return pgdat_balanced(pgdat, order, classzone_idx);
}
@@ -3267,7 +3339,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}