summaryrefslogtreecommitdiff
path: root/mm/vmstat.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--mm/vmstat.c495
1 files changed, 430 insertions, 65 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed5..312d728 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,22 +12,26 @@
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
+static void sum_vm_events(unsigned long *ret)
{
int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- for_each_cpu(cpu, cpumask) {
+ for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -43,7 +47,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret, cpu_online_mask);
+ sum_vm_events(ret);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
@@ -136,10 +140,24 @@ static void refresh_zone_stat_thresholds(void)
int threshold;
for_each_populated_zone(zone) {
+ unsigned long max_drift, tolerate_drift;
+
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
}
}
@@ -149,35 +167,24 @@ static void refresh_zone_stat_thresholds(void)
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
- x = delta + *p;
+ x = delta + __this_cpu_read(*p);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -202,16 +209,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)++;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
@@ -223,16 +231,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)--;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
@@ -242,6 +251,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the following
+ * will apply the threshold again and therefore bring the
+ * counter under the threshold.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -272,6 +367,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
/*
* Update the zone counters for one cpu.
@@ -300,7 +396,7 @@ void refresh_cpu_vm_stats(int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
- p = zone_pcp(zone, cpu);
+ p = per_cpu_ptr(zone->pageset, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (p->vm_stat_diff[i]) {
@@ -376,7 +472,87 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
}
#endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_COMPACTION
+
+struct contig_page_info {
+ unsigned long free_pages;
+ unsigned long free_blocks_total;
+ unsigned long free_blocks_suitable;
+};
+
+/*
+ * Calculate the number of free pages in a zone, how many contiguous
+ * pages are free and how many are large enough to satisfy an allocation of
+ * the target size. Note that this function makes no attempt to estimate
+ * how many suitable free blocks there *might* be if MOVABLE pages were
+ * migrated. Calculating that is possible, but expensive and can be
+ * figured out from userspace
+ */
+static void fill_contig_page_info(struct zone *zone,
+ unsigned int suitable_order,
+ struct contig_page_info *info)
+{
+ unsigned int order;
+
+ info->free_pages = 0;
+ info->free_blocks_total = 0;
+ info->free_blocks_suitable = 0;
+
+ for (order = 0; order < MAX_ORDER; order++) {
+ unsigned long blocks;
+
+ /* Count number of free blocks */
+ blocks = zone->free_area[order].nr_free;
+ info->free_blocks_total += blocks;
+
+ /* Count free base pages */
+ info->free_pages += blocks << order;
+
+ /* Count the suitable free blocks */
+ if (order >= suitable_order)
+ info->free_blocks_suitable += blocks <<
+ (order - suitable_order);
+ }
+}
+
+/*
+ * A fragmentation index only makes sense if an allocation of a requested
+ * size would fail. If that is true, the fragmentation index indicates
+ * whether external fragmentation or a lack of memory was the problem.
+ * The value can be used to determine if page reclaim or compaction
+ * should be used
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+ unsigned long requested = 1UL << order;
+
+ if (!info->free_blocks_total)
+ return 0;
+
+ /* Fragmentation index only makes sense when a request would fail */
+ if (info->free_blocks_suitable)
+ return -1000;
+
+ /*
+ * Index is between 0 and 1 so return within 3 decimal places
+ *
+ * 0 => allocation would fail due to lack of memory
+ * 1 => allocation would fail due to fragmentation
+ */
+ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
+}
+
+/* Same as __fragmentation index but allocs contig_page_info on stack */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ return __fragmentation_index(order, &info);
+}
+#endif
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -429,7 +605,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
spin_unlock_irqrestore(&zone->lock, flags);
}
}
+#endif
+#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -639,10 +817,16 @@ static const char * const vmstat_text[] = {
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
"nr_page_table_pages",
+ "nr_kernel_stack",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
"nr_writeback_temp",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_shmem",
+ "nr_dirtied",
+ "nr_written",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -652,6 +836,8 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
@@ -680,10 +866,23 @@ static const char * const vmstat_text[] = {
"slabs_scanned",
"kswapd_steal",
"kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
+ "kswapd_skip_congestion_wait",
"pageoutrun",
"allocstall",
"pgrotated",
+
+#ifdef CONFIG_COMPACTION
+ "compact_blocks_moved",
+ "compact_pages_moved",
+ "compact_pagemigrate_failed",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
#ifdef CONFIG_HUGETLB_PAGE
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
@@ -712,7 +911,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_page_state(zone, NR_FREE_PAGES),
+ zone_nr_free_pages(zone),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@@ -735,7 +934,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, i);
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
@@ -752,11 +951,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n all_unreclaimable: %u"
- "\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone_is_all_unreclaimable(zone),
- zone->prev_priority,
+ zone->all_unreclaimable,
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
@@ -792,36 +989,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
.release = seq_release,
};
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- unsigned long *e;
-#endif
- int i;
+ int i, stat_items_size;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
+ stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
+ stat_items_size += sizeof(struct vm_event_state);
#endif
+
+ v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
#endif
- return v + *pos;
+ return (unsigned long *)m->private + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -880,11 +1085,10 @@ static void vmstat_update(struct work_struct *w)
static void __cpuinit start_cpu_timer(int cpu)
{
- struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+ struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
- schedule_delayed_work_on(cpu, vmstat_work,
- __round_jiffies_relative(HZ, cpu));
+ INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+ schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
/*
@@ -900,11 +1104,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ refresh_zone_stat_thresholds();
start_cpu_timer(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
@@ -945,3 +1151,162 @@ static int __init setup_vmstat(void)
return 0;
}
module_init(setup_vmstat)
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
+#include <linux/debugfs.h>
+
+static struct dentry *extfrag_debug_root;
+
+/*
+ * Return an index indicating how much of the available free memory is
+ * unusable for an allocation of the requested size.
+ */
+static int unusable_free_index(unsigned int order,
+ struct contig_page_info *info)
+{
+ /* No free memory is interpreted as all free memory is unusable */
+ if (info->free_pages == 0)
+ return 1000;
+
+ /*
+ * Index should be a value between 0 and 1. Return a value to 3
+ * decimal places.
+ *
+ * 0 => no fragmentation
+ * 1 => high fragmentation
+ */
+ return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
+
+}
+
+static void unusable_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = unusable_free_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display unusable free space index
+ *
+ * The unusable free space index measures how much of the available free
+ * memory cannot be used to satisfy an allocation of a given size and is a
+ * value between 0 and 1. The higher the value, the more of free memory is
+ * unusable and by implication, the worse the external fragmentation is. This
+ * can be expressed as a percentage by multiplying by 100.
+ */
+static int unusable_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ return 0;
+
+ walk_zones_in_node(m, pgdat, unusable_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations unusable_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = unusable_show,
+};
+
+static int unusable_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unusable_op);
+}
+
+static const struct file_operations unusable_file_ops = {
+ .open = unusable_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void extfrag_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+
+ /* Alloc on stack as interrupts are disabled for zone walk */
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = __fragmentation_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display fragmentation index for orders that allocations would fail for
+ */
+static int extfrag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ walk_zones_in_node(m, pgdat, extfrag_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations extfrag_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = extfrag_show,
+};
+
+static int extfrag_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &extfrag_op);
+}
+
+static const struct file_operations extfrag_file_ops = {
+ .open = extfrag_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init extfrag_debug_init(void)
+{
+ extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
+ if (!extfrag_debug_root)
+ return -ENOMEM;
+
+ if (!debugfs_create_file("unusable_index", 0444,
+ extfrag_debug_root, NULL, &unusable_file_ops))
+ return -ENOMEM;
+
+ if (!debugfs_create_file("extfrag_index", 0444,
+ extfrag_debug_root, NULL, &extfrag_file_ops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+module_init(extfrag_debug_init);
+#endif