diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 228 |
1 files changed, 172 insertions, 56 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 925b431..f3a84c6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = memcg, .gfp_mask = gfp_mask, .order = order, }; @@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, goto unlock; } - check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1289,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task, totalpages)) { + switch (oom_scan_process_thread(&oc, task)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, memcg, + oom_kill_process(&oc, chosen, points, totalpages, "Memory cgroup out of memory"); } unlock: @@ -1608,7 +1609,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || current->memcg_in_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -2272,20 +2273,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -/* +static inline bool memcg_kmem_bypass(void) +{ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + return false; +} + +/** + * memcg_kmem_get_cache: select the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. * - * If the cache does not exist yet, if we are the first user of it, - * we either create it immediately, if possible, or create it asynchronously - * in a workqueue. - * In the latter case, we will let the current allocation go through with - * the original cache. + * If the cache does not exist yet, if we are the first user of it, we + * create it asynchronously in a workqueue and let the current allocation + * go through with the original cache. * - * Can't be called in interrupt context or from kernel threads. - * This function needs to be called with rcu_read_lock() held. + * This function takes a reference to the cache it returns to assure it + * won't get destroyed while we are working with it. Once the caller is + * done with it, memcg_kmem_put_cache() must be called to release the + * reference. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2293,10 +2304,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) VM_BUG_ON(!is_root_cache(cachep)); - if (cachep->flags & SLAB_ACCOUNT) - gfp |= __GFP_ACCOUNT; - - if (!(gfp & __GFP_ACCOUNT)) + if (memcg_kmem_bypass()) return cachep; if (current->memcg_kmem_skip_account) @@ -2329,14 +2337,27 @@ out: return cachep; } -void __memcg_kmem_put_cache(struct kmem_cache *cachep) +/** + * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache + * @cachep: the cache returned by memcg_kmem_get_cache + */ +void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg) +/** + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * @memcg: memory cgroup to charge + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; struct page_counter *counter; @@ -2357,19 +2378,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, return 0; } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +/** + * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; + if (memcg_kmem_bypass()) + return 0; + memcg = get_mem_cgroup_from_mm(current->mm); if (!mem_cgroup_is_root(memcg)) - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } - -void __memcg_kmem_uncharge(struct page *page, int order) +/** + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order + */ +void memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2896,6 +2932,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) * ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ + rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ css_for_each_descendant_pre(css, &memcg->css) { child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); @@ -2903,6 +2940,8 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!memcg->use_hierarchy) break; } + rcu_read_unlock(); + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); memcg_free_cache_id(kmemcg_id); @@ -4054,6 +4093,60 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4113,6 +4206,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return NULL; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto fail; + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto fail; @@ -4139,8 +4238,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); mem_cgroup_free(memcg); return NULL; } @@ -4200,15 +4302,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: mem_cgroup_free(memcg); - return NULL; + return ERR_PTR(-ENOMEM); } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); return 0; } @@ -4231,6 +4332,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4342,7 +4445,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, #ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); @@ -4361,7 +4464,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, } #else static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { return NULL; } @@ -4404,7 +4507,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /** * mem_cgroup_move_account - move account of the page * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) + * @compound: charge the page as compound or small page * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -4526,7 +4629,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) - page = mc_handle_swap_pte(vma, addr, ptent, &ent); + page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); @@ -5266,6 +5369,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * @mm: mm context of the victim * @gfp_mask: reclaim mode * @memcgp: charged memcg return + * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. @@ -5328,6 +5432,7 @@ out: * @page: page to charge * @memcg: memcg to charge the page to * @lrucare: page might be on LRU already + * @compound: charge the page as compound or small page * * Finalize a charge transaction started by mem_cgroup_try_charge(), * after page->mapping has been set up. This must happen atomically @@ -5379,6 +5484,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * mem_cgroup_cancel_charge - cancel a page charge * @page: page to charge * @memcg: memcg to charge the page to + * @compound: charge the page as compound or small page * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ @@ -5402,15 +5508,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, struct page *dummy_page) + unsigned long nr_huge, unsigned long nr_kmem, + struct page *dummy_page) { - unsigned long nr_pages = nr_anon + nr_file; + unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) + page_counter_uncharge(&memcg->kmem, nr_kmem); memcg_oom_recover(memcg); } @@ -5433,6 +5542,7 @@ static void uncharge_list(struct list_head *page_list) unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; + unsigned long nr_kmem = 0; unsigned long pgpgout = 0; struct list_head *next; struct page *page; @@ -5443,8 +5553,6 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { - unsigned int nr_pages = 1; - page = list_entry(next, struct page, lru); next = page->lru.next; @@ -5463,31 +5571,34 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); - pgpgout = nr_anon = nr_file = nr_huge = 0; + nr_huge, nr_kmem, page); + pgpgout = nr_anon = nr_file = + nr_huge = nr_kmem = 0; } memcg = page->mem_cgroup; } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - nr_huge += nr_pages; - } + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; - if (PageAnon(page)) - nr_anon += nr_pages; - else - nr_file += nr_pages; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + nr_huge += nr_pages; + } + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + pgpgout++; + } else + nr_kmem += 1 << compound_order(page); page->mem_cgroup = NULL; - - pgpgout++; } while (next != page_list); if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); + nr_huge, nr_kmem, page); } /** @@ -5541,6 +5652,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) struct mem_cgroup *memcg; unsigned int nr_pages; bool compound; + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5571,10 +5683,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, false); - local_irq_disable(); + local_irq_save(flags); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); memcg_check_events(memcg, newpage); - local_irq_enable(); + local_irq_restore(flags); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); @@ -5752,6 +5864,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5770,6 +5883,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /* @@ -5800,11 +5916,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) !page_counter_try_charge(&memcg->swap, 1, &counter)) return -ENOMEM; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); return 0; } @@ -5833,7 +5949,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) page_counter_uncharge(&memcg->memsw, 1); } mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } |