summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- mm/memcontrol.c 228
1 files changed, 172 insertions, 56 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 925b431..f3a84c6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct oom_control oc = {
.zonelist = NULL,
.nodemask = NULL,
+ .memcg = memcg,
.gfp_mask = gfp_mask,
.order = order,
};
@@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1289,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(&oc, task, totalpages)) {
+ switch (oom_scan_process_thread(&oc, task)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ oom_kill_process(&oc, chosen, points, totalpages,
"Memory cgroup out of memory");
}
unlock:
@@ -1608,7 +1609,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom || current->memcg_in_oom)
+ if (!current->memcg_may_oom)
return;
/*
* We are in the middle of the charge context here, so we
@@ -2272,20 +2273,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
current->memcg_kmem_skip_account = 0;
}
-/*
+static inline bool memcg_kmem_bypass(void)
+{
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+ return false;
+}
+
+/**
+ * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
+ * @cachep: the original global kmem cache
+ *
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
- * If the cache does not exist yet, if we are the first user of it,
- * we either create it immediately, if possible, or create it asynchronously
- * in a workqueue.
- * In the latter case, we will let the current allocation go through with
- * the original cache.
+ * If the cache does not exist yet and we are the first user of it, we
+ * create it asynchronously in a workqueue and let the current allocation
+ * go through with the original cache.
*
- * Can't be called in interrupt context or from kernel threads.
- * This function needs to be called with rcu_read_lock() held.
+ * This function takes a reference to the cache it returns to assure it
+ * won't get destroyed while we are working with it. Once the caller is
+ * done with it, memcg_kmem_put_cache() must be called to release the
+ * reference.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
+struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2293,10 +2304,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
VM_BUG_ON(!is_root_cache(cachep));
- if (cachep->flags & SLAB_ACCOUNT)
- gfp |= __GFP_ACCOUNT;
-
- if (!(gfp & __GFP_ACCOUNT))
+ if (memcg_kmem_bypass())
return cachep;
if (current->memcg_kmem_skip_account)
@@ -2329,14 +2337,27 @@ out:
return cachep;
}
-void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+/**
+ * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
+ * @cachep: the cache returned by memcg_kmem_get_cache
+ */
+void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
css_put(&cachep->memcg_params.memcg->css);
}
-int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg)
+/**
+ * memcg_kmem_charge_memcg: charge a kmem page
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ * @memcg: memory cgroup to charge
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
struct page_counter *counter;
@@ -2357,19 +2378,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
return 0;
}
-int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+/**
+ * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
+ if (memcg_kmem_bypass())
+ return 0;
+
memcg = get_mem_cgroup_from_mm(current->mm);
if (!mem_cgroup_is_root(memcg))
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
-
-void __memcg_kmem_uncharge(struct page *page, int order)
+/**
+ * memcg_kmem_uncharge: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
+ */
+void memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2896,6 +2932,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
* ordering is imposed by list_lru_node->lock taken by
* memcg_drain_all_list_lrus().
*/
+ rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
css_for_each_descendant_pre(css, &memcg->css) {
child = mem_cgroup_from_css(css);
BUG_ON(child->kmemcg_id != kmemcg_id);
@@ -2903,6 +2940,8 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!memcg->use_hierarchy)
break;
}
+ rcu_read_unlock();
+
memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
memcg_free_cache_id(kmemcg_id);
@@ -4054,6 +4093,60 @@ static struct cftype mem_cgroup_legacy_files[] = {
{ }, /* terminate */
};
+/*
+ * Private memory cgroup IDR
+ *
+ * Swap-out records and page cache shadow entries need to store memcg
+ * references in constrained space, so we maintain an ID space that is
+ * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
+ * memory-controlled cgroups to 64k.
+ *
+ * However, there usually are many references to the offline CSS after
+ * the cgroup has been destroyed, such as page cache or reclaimable
+ * slab objects, that don't need to hang on to the ID. We want to keep
+ * those dead CSS from occupying IDs, or we might quickly exhaust the
+ * relatively small ID space and prevent the creation of new cgroups
+ * even when there are much fewer than 64k cgroups - possibly none.
+ *
+ * Maintain a private 16-bit ID space for memcg, and allow the ID to
+ * be freed and recycled when it's no longer needed, which is usually
+ * when the CSS is offlined.
+ *
+ * The only exception to that are records of swapped out tmpfs/shmem
+ * pages that need to be attributed to live ancestors on swapin. But
+ * those references are manageable from userspace.
+ */
+
+static DEFINE_IDR(mem_cgroup_idr);
+
+static void mem_cgroup_id_get(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg->id.ref);
+}
+
+static void mem_cgroup_id_put(struct mem_cgroup *memcg)
+{
+ if (atomic_dec_and_test(&memcg->id.ref)) {
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
+ memcg->id.id = 0;
+
+ /* Memcg ID pins CSS */
+ css_put(&memcg->css);
+ }
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from a memcg id
+ * @id: the memcg id to look up
+ *
+ * Caller must hold rcu_read_lock().
+ */
+struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&mem_cgroup_idr, id);
+}
+
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -4113,6 +4206,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (!memcg)
return NULL;
+ memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
+ 1, MEM_CGROUP_ID_MAX,
+ GFP_KERNEL);
+ if (memcg->id.id < 0)
+ goto fail;
+
memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!memcg->stat)
goto fail;
@@ -4139,8 +4238,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
+ if (memcg->id.id > 0)
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
mem_cgroup_free(memcg);
return NULL;
}
@@ -4200,15 +4302,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
fail:
mem_cgroup_free(memcg);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
+ /* Online state pins memcg ID, memcg ID pins CSS */
+ mem_cgroup_id_get(mem_cgroup_from_css(css));
+ css_get(css);
return 0;
}
@@ -4231,6 +4332,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+
+ mem_cgroup_id_put(memcg);
}
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -4342,7 +4445,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
#ifdef CONFIG_SWAP
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
@@ -4361,7 +4464,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
}
#else
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ pte_t ptent, swp_entry_t *entry)
{
return NULL;
}
@@ -4404,7 +4507,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
+ * @compound: charge the page as compound or small page
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
@@ -4526,7 +4629,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
- page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+ page = mc_handle_swap_pte(vma, ptent, &ent);
else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
@@ -5266,6 +5369,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
* @memcgp: charged memcg return
+ * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
@@ -5328,6 +5432,7 @@ out:
* @page: page to charge
* @memcg: memcg to charge the page to
* @lrucare: page might be on LRU already
+ * @compound: charge the page as compound or small page
*
* Finalize a charge transaction started by mem_cgroup_try_charge(),
* after page->mapping has been set up. This must happen atomically
@@ -5379,6 +5484,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
* mem_cgroup_cancel_charge - cancel a page charge
* @page: page to charge
* @memcg: memcg to charge the page to
+ * @compound: charge the page as compound or small page
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
@@ -5402,15 +5508,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_huge, struct page *dummy_page)
+ unsigned long nr_huge, unsigned long nr_kmem,
+ struct page *dummy_page)
{
- unsigned long nr_pages = nr_anon + nr_file;
+ unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(memcg)) {
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
+ page_counter_uncharge(&memcg->kmem, nr_kmem);
memcg_oom_recover(memcg);
}
@@ -5433,6 +5542,7 @@ static void uncharge_list(struct list_head *page_list)
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
+ unsigned long nr_kmem = 0;
unsigned long pgpgout = 0;
struct list_head *next;
struct page *page;
@@ -5443,8 +5553,6 @@ static void uncharge_list(struct list_head *page_list)
*/
next = page_list->next;
do {
- unsigned int nr_pages = 1;
-
page = list_entry(next, struct page, lru);
next = page->lru.next;
@@ -5463,31 +5571,34 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, page);
- pgpgout = nr_anon = nr_file = nr_huge = 0;
+ nr_huge, nr_kmem, page);
+ pgpgout = nr_anon = nr_file =
+ nr_huge = nr_kmem = 0;
}
memcg = page->mem_cgroup;
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- nr_huge += nr_pages;
- }
+ if (!PageKmemcg(page)) {
+ unsigned int nr_pages = 1;
- if (PageAnon(page))
- nr_anon += nr_pages;
- else
- nr_file += nr_pages;
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ nr_huge += nr_pages;
+ }
+ if (PageAnon(page))
+ nr_anon += nr_pages;
+ else
+ nr_file += nr_pages;
+ pgpgout++;
+ } else
+ nr_kmem += 1 << compound_order(page);
page->mem_cgroup = NULL;
-
- pgpgout++;
} while (next != page_list);
if (memcg)
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, page);
+ nr_huge, nr_kmem, page);
}
/**
@@ -5541,6 +5652,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
struct mem_cgroup *memcg;
unsigned int nr_pages;
bool compound;
+ unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
@@ -5571,10 +5683,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
- local_irq_disable();
+ local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
memcg_check_events(memcg, newpage);
- local_irq_enable();
+ local_irq_restore(flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -5752,6 +5864,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!memcg)
return;
+ mem_cgroup_id_get(memcg);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
@@ -5770,6 +5883,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
+
+ if (!mem_cgroup_is_root(memcg))
+ css_put(&memcg->css);
}
/*
@@ -5800,11 +5916,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
!page_counter_try_charge(&memcg->swap, 1, &counter))
return -ENOMEM;
+ mem_cgroup_id_get(memcg);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
- css_get(&memcg->css);
return 0;
}
@@ -5833,7 +5949,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
page_counter_uncharge(&memcg->memsw, 1);
}
mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
+ mem_cgroup_id_put(memcg);
}
rcu_read_unlock();
}