From fbbb7f4e149f6dd19a8dbebc9fa5c5b72173c6de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: remove the usage of separate populated bitmap in percpu-vm percpu-vm uses pcpu_get_pages_and_bitmap() to acquire temp pages array and populated bitmap and uses the two during [de]population. The temp bitmap is used only to build the new bitmap that is copied to chunk->populated after the operation succeeds; however, the new bitmap can be trivially set after success without using the temp bitmap. This patch removes the temp populated bitmap usage from percpu-vm.c. * pcpu_get_pages_and_bitmap() is renamed to pcpu_get_pages() and no longer hands out the temp bitmap. * @populated arugment is dropped from all the related functions. @populated updates in pcpu_[un]map_pages() are dropped. * Two loops in pcpu_map_pages() are merged. * pcpu_[de]populated_chunk() modify chunk->populated bitmap directly from @page_start and @page_end after success. Signed-off-by: Tejun Heo Acked-by: Christoph Lameter diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 5110816..47b47bf 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,24 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and access + * exclusion is the caller's responsibility. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk, bool may_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - - *bitmapp = bitmap; + if (!pages && may_alloc) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +45,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +52,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. 
*/ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +71,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,8 +79,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu, tcpu; @@ -164,7 +138,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -175,8 +148,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -192,8 +164,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -228,7 +198,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -236,13 +205,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). 
*/ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -253,18 +220,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -314,7 +275,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_end = PFN_UP(off + size); int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; unsigned int cpu; int rs, re, rc; @@ -327,28 +287,27 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk, true); if (!pages) return -ENOMEM; /* alloc and map */ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + rc = pcpu_alloc_pages(chunk, pages, rs, re); if (rc) goto err_free; free_end = re; } pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); + rc = pcpu_map_pages(chunk, pages, rs, re); if (rc) goto err_unmap; unmap_end = re; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + bitmap_set(chunk->populated, page_start, page_end - page_start); clear: for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -357,11 +316,11 @@ clear: err_unmap: pcpu_pre_unmap_flush(chunk, page_start, unmap_end); pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, rs, re); pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); err_free: pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); + pcpu_free_pages(chunk, pages, rs, re); return rc; } @@ -383,7 +342,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; int rs, re; /* quick path, check whether it's empty already */ @@ -400,22 +358,21 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) * successful population attempt so the temp pages array must * be available now. 
*/ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk, false); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, rs, re); /* no need to flush tlb, vmalloc will handle it lazily */ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); + pcpu_free_pages(chunk, pages, rs, re); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + bitmap_clear(chunk->populated, page_start, page_end - page_start); } static struct pcpu_chunk *pcpu_create_chunk(void) -- cgit v0.10.2 From cdb4cba5a3c9fa27240d04f4f8dad316b10d995b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: remove @may_alloc from pcpu_get_pages() pcpu_get_pages() creates the temp pages array if not already allocated and returns the pointer to it. As the function is called from both [de]population paths and depopulation can only happen after at least one successful population, the param doesn't make any difference - the allocation will always happen on the population path anyway. Remove @may_alloc from pcpu_get_pages(). Also, add an lockdep assertion pcpu_alloc_mutex instead of vaguely stating that the exclusion is the caller's responsibility. Signed-off-by: Tejun Heo diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 47b47bf..d9e0b61 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -22,21 +22,22 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, /** * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @may_alloc: may allocate the array * * Returns pointer to array of pointers to struct page which can be indexed - * with pcpu_page_idx(). Note that there is only one array and access - * exclusion is the caller's responsibility. + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages(struct pcpu_chunk *chunk, bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - if (!pages && may_alloc) + lockdep_assert_held(&pcpu_alloc_mutex); + + if (!pages) pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -287,7 +288,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); - pages = pcpu_get_pages(chunk, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; @@ -358,7 +359,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages(chunk, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ -- cgit v0.10.2 From dca496451bddea9aa87b7510dc2eb413d1a19dfd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: move common parts out of pcpu_[de]populate_chunk() percpu-vm and percpu-km implement separate versions of pcpu_[de]populate_chunk() and some part which is or should be common are currently in the specific implementations. 
Make the following changes. * Allocate area clearing is moved from the pcpu_populate_chunk() implementations to pcpu_alloc(). This makes percpu-km's version noop. * Quick exit tests in pcpu_[de]populate_chunk() of percpu-vm are moved to their respective callers so that they are applied to percpu-km too. This doesn't make any meaningful difference as both functions are noop for percpu-km; however, this is more consistent and will help implementing atomic allocation support. Signed-off-by: Tejun Heo diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fe..9a9096f 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -35,11 +35,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index d9e0b61..edf7097 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -265,7 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. @@ -276,18 +276,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_end = PFN_UP(off + size); int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned int cpu; int rs, re, rc; - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; @@ -308,10 +298,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) } pcpu_post_map_flush(chunk, page_start, page_end); - bitmap_set(chunk->populated, page_start, page_end - page_start); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; err_unmap: @@ -345,15 +331,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) struct page **pages; int rs, re; - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); - /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must @@ -372,8 +349,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) pcpu_free_pages(chunk, pages, rs, re); - - bitmap_clear(chunk->populated, page_start, page_end - page_start); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index da997f9..6087384 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -709,7 +709,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + int slot, off, new_alloc, cpu; + int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -802,17 +803,32 @@ restart: area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the 
area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + + if (rs != page_start || re != page_end) { + WARN_ON(chunk->immutable); + + if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + + bitmap_set(chunk->populated, page_start, page_end - page_start); } mutex_unlock(&pcpu_alloc_mutex); - /* return address relative to base address */ + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; @@ -903,7 +919,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs = 0, re; + + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); + if (rs || re != PFN_UP(pcpu_unit_size)) + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + pcpu_destroy_chunk(chunk); } -- cgit v0.10.2 From a93ace487a339dccf7040be7fee08c3415188e14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: move region iterations out of pcpu_[de]populate_chunk() Previously, pcpu_[de]populate_chunk() were called with the range which may contain multiple target regions in it and pcpu_[de]populate_chunk() iterated over the regions. This has the benefit of batching up cache flushes for all the regions; however, we're planning to add more bookkeeping logic around [de]population to support atomic allocations and this delegation of iterations gets in the way. This patch moves the region iterations out of pcpu_[de]populate_chunk() into its callers - pcpu_alloc() and pcpu_reclaim() - so that we can later add logic to track more states around them. This change may make cache and tlb flushes more frequent but multi-region [de]populations are rare anyway and if this actually becomes a problem, it's not difficult to factor out cache flushes as separate callbacks which are directly invoked from percpu.c. Signed-off-by: Tejun Heo diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 9a9096f..a6e34bc 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,12 +33,14 @@ #include -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index edf7097..538998a 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -261,8 +261,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. 
@@ -270,66 +270,43 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - int rs, re, rc; pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. 
*/ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - int rs, re; /* * If control reaches here, there must have been at least one @@ -342,13 +319,11 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, rs, re); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index 6087384..fe5de97 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -807,20 +807,17 @@ area_found: page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - - if (rs != page_start || re != page_end) { + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, off, size)) { + if (pcpu_populate_chunk(chunk, rs, re)) { spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, page_start, page_end - page_start); + bitmap_set(chunk->populated, rs, re - rs); } mutex_unlock(&pcpu_alloc_mutex); @@ -919,12 +916,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - int rs = 0, re; - - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); - if (rs || re != PFN_UP(pcpu_unit_size)) - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs, re; + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + bitmap_clear(chunk->populated, rs, re - rs); + } pcpu_destroy_chunk(chunk); } -- cgit v0.10.2 From a63d4ac4ab6094c051a5a240260d16117a7a2f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: make percpu-km set chunk->populated bitmap properly percpu-km instantiates the whole chunk on creation and doesn't make use of chunk->populated bitmap and leaves it as zero. While this currently doesn't cause any problem, the inconsistency makes it difficult to build further logic on top of chunk->populated. This patch makes percpu-km fill chunk->populated on creation so that the bitmap is always consistent. Signed-off-by: Tejun Heo Acked-by: Christoph Lameter diff --git a/mm/percpu-km.c b/mm/percpu-km.c index a6e34bc..67a971b 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -67,6 +67,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + bitmap_fill(chunk->populated, nr_pages); + return chunk; } -- cgit v0.10.2 From b38d08f3181c5025a7ce84646494cc4748492a3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: restructure locking At first, the percpu allocator required a sleepable context for both alloc and free paths and used pcpu_alloc_mutex to protect everything. 
Later, pcpu_lock was introduced to protect the index data structure so that the free path can be invoked from atomic contexts. The conversion only updated what's necessary and left most of the allocation path under pcpu_alloc_mutex. The percpu allocator is planned to add support for atomic allocation and this patch restructures locking so that the coverage of pcpu_alloc_mutex is further reduced. * pcpu_alloc() now grabs pcpu_alloc_mutex only while creating a new chunk and populating the allocated area. Everything else is now protected solely by pcpu_lock. After this change, multiple instances of pcpu_extend_area_map() may race but the function already implements sufficient synchronization using pcpu_lock. This also allows multiple allocators to arrive at new chunk creation. To avoid creating multiple empty chunks back-to-back, a new chunk is created iff there is no other empty chunk after grabbing pcpu_alloc_mutex. * pcpu_lock is now held while modifying chunk->populated bitmap. After this, all data structures are protected by pcpu_lock. Signed-off-by: Tejun Heo diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 67a971b..e662b49 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -68,7 +68,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + spin_lock_irq(&pcpu_lock); bitmap_fill(chunk->populated, nr_pages); + spin_unlock_irq(&pcpu_lock); return chunk; } diff --git a/mm/percpu.c b/mm/percpu.c index fe5de97..e59f7b4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -152,31 +152,12 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked.
*/ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -709,7 +690,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc, cpu; + int slot, off, new_alloc, cpu, ret; int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -729,7 +710,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -745,7 +725,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } @@ -771,7 +751,7 @@ restart: if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -787,37 +767,53 @@ restart: } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. 
+ */ + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ + mutex_lock(&pcpu_alloc_mutex); page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, rs, re)) { - spin_lock_irqsave(&pcpu_lock, flags); + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); @@ -832,8 +828,7 @@ area_found: fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); +fail: if (warn_limit) { pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " "%s\n", size, align, err); -- cgit v0.10.2 From a16037c8dfc2734c1a2c8e3ffd4766ed25f2a41d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: make pcpu_alloc_area() capable of allocating only from populated areas Update pcpu_alloc_area() so that it can skip unpopulated areas if the new parameter @pop_only is true. This is implemented by a new function, pcpu_fit_in_area(), which determines the amount of head padding considering the alignment and populated state. @pop_only is currently always false but this will be used to implement atomic allocation. Signed-off-by: Tejun Heo diff --git a/mm/percpu.c b/mm/percpu.c index e59f7b4..e18aa14 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -400,10 +400,60 @@ out_unlock: } /** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. + */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. 
+ */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + +/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -418,7 +468,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -434,11 +485,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -730,7 +781,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; @@ -761,7 +812,7 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; } -- cgit v0.10.2 From e04d320838f573d8fa989a0d7af0972f9b0142d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: indent the population block in pcpu_alloc() The next patch will conditionalize the population block in pcpu_alloc() which will end up making a rather large indentation change obfuscating the actual logic change. This patch puts the block under "if (true)" so that the next patch can avoid indentation changes. The defintions of the local variables which are used only in the block are moved into the block. This patch is purely cosmetic. 
Signed-off-by: Tejun Heo diff --git a/mm/percpu.c b/mm/percpu.c index e18aa14..577d84f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -742,7 +742,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) struct pcpu_chunk *chunk; const char *err; int slot, off, new_alloc, cpu, ret; - int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -847,27 +846,32 @@ area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - mutex_lock(&pcpu_alloc_mutex); - page_start = PFN_DOWN(off); - page_end = PFN_UP(off + size); + if (true) { + int page_start, page_end, rs, re; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - WARN_ON(chunk->immutable); + mutex_lock(&pcpu_alloc_mutex); - ret = pcpu_populate_chunk(chunk, rs, re); + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); - spin_lock_irqsave(&pcpu_lock, flags); - if (ret) { - mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } - bitmap_set(chunk->populated, rs, re - rs); - spin_unlock_irqrestore(&pcpu_lock, flags); - } - mutex_unlock(&pcpu_alloc_mutex); + mutex_unlock(&pcpu_alloc_mutex); + } /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) -- cgit v0.10.2 From 5835d96e9ce4efdba8c6cefffc2f1575925456de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: implement [__]alloc_percpu_gfp() Now that pcpu_alloc_area() can allocate only from populated areas, it's easy to add atomic allocation support to [__]alloc_percpu(). Update pcpu_alloc() so that it accepts @gfp and skips all the blocking operations and allocates only from the populated areas if @gfp doesn't contain GFP_KERNEL. New interface functions [__]alloc_percpu_gfp() are added. While this means that atomic allocations are possible, this isn't complete yet as there's no mechanism to ensure that certain amount of populated areas is kept available and atomic allocations may keep failing under certain conditions. 
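As an illustration only (not part of this patch; struct foo_stats, foo_stats_init() and foo_stats_hit() are made up), a caller that cannot sleep could use the new interface roughly like this:

	struct foo_stats {
		u64 hits;
		u64 misses;
	};

	static struct foo_stats __percpu *foo_stats;

	/* rough sketch: may be called with spinlocks held or from irq context */
	static int foo_stats_init(void)
	{
		foo_stats = alloc_percpu_gfp(struct foo_stats, GFP_NOWAIT);
		return foo_stats ? 0 : -ENOMEM;	/* can fail more often than GFP_KERNEL */
	}

	static void foo_stats_hit(void)
	{
		this_cpu_inc(foo_stats->hits);	/* per-cpu access is unchanged */
	}

GFP_ATOMIC behaves the same way here since it doesn't contain GFP_KERNEL, and the area is still freed with free_percpu().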
Signed-off-by: Tejun Heo diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 6f61b61..d1b416d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void); #endif extern void __init percpu_init_late(void); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -#define alloc_percpu(type) \ - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) +#define alloc_percpu_gfp(type, gfp) \ + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ + __alignof__(type), gfp) +#define alloc_percpu(type) \ + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 577d84f..c52b931 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -151,11 +151,6 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; -/* - * Free path accesses and alters only the index data structures and can be - * safely called from atomic context. When memory needs to be returned to - * the system, free path schedules reclaim_work. - */ static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ @@ -727,20 +722,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; + bool is_atomic = !(gfp & GFP_KERNEL); int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -773,14 +769,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) while ((new_alloc = pcpu_need_to_extend(chunk))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; @@ -797,6 +794,8 @@ restart: new_alloc = pcpu_need_to_extend(chunk); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -811,7 +810,7 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; } @@ -824,6 +823,9 @@ restart: * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. 
*/ + if (is_atomic) + goto fail; + mutex_lock(&pcpu_alloc_mutex); if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { @@ -846,7 +848,7 @@ area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - if (true) { + if (!is_atomic) { int page_start, page_end, rs, re; mutex_lock(&pcpu_alloc_mutex); @@ -884,9 +886,9 @@ area_found: fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -895,22 +897,34 @@ fail: } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -932,7 +946,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** -- cgit v0.10.2 From 9c824b6a172c8d44a6b037946bae90127c969b1b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: make sure chunk->map array has available space An allocation attempt may require extending chunk->map array which requires GFP_KERNEL context which isn't available for atomic allocations. This patch ensures that chunk->map array usually keeps some amount of available space by directly allocating buffer space during GFP_KERNEL allocations and scheduling async extension during atomic ones. This should make atomic allocation failures from map space exhaustion rare. 
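For illustration, a simplified sketch of the sizing rule added to pcpu_need_to_extend() below (the helper name is made up; the constants are the ones this patch introduces):

	/* sketch only: pick the new ->map[] length for a given usage level */
	static int map_target_alloc(int map_used, bool is_atomic)
	{
		int margin = is_atomic ? 3 : 64;	/* 64 == PCPU_ATOMIC_MAP_MARGIN_HIGH */
		int new_alloc = 16;			/* PCPU_DFL_MAP_ALLOC */

		while (new_alloc < map_used + margin)
			new_alloc *= 2;
		return new_alloc;			/* e.g. map_used == 100 -> 256 for GFP_KERNEL */
	}

A GFP_KERNEL caller extends ->map[] to this size directly; an atomic caller never extends it itself and instead relies on map_extend_work having kept map_alloc at least PCPU_ATOMIC_MAP_MARGIN_LOW (32) entries ahead of map_used.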
Signed-off-by: Tejun Heo diff --git a/mm/percpu.c b/mm/percpu.c index c52b931..546ced0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,8 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,9 +104,12 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ @@ -318,9 +323,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accomodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -329,15 +339,25 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. 
*/ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } + + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -394,6 +414,20 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -647,6 +681,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -767,7 +802,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic || pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -792,7 +827,7 @@ restart: if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { if (is_atomic) continue; @@ -1418,6 +1453,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1446,6 +1482,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); -- cgit v0.10.2 From b539b87fed37ffc16c89a6bc3beca2d7aed82e1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implmeent pcpu_nr_empty_pop_pages and chunk->nr_populated pcpu_nr_empty_pop_pages counts the number of empty populated pages across all chunks and chunk->nr_populated counts the number of populated pages in a chunk. Both will be used to implement pre/async population for atomic allocations. pcpu_chunk_[de]populated() are added to update chunk->populated, chunk->nr_populated and pcpu_nr_empty_pop_pages together. All successful chunk [de]populations should be followed by the corresponding pcpu_chunk_[de]populated() calls. 
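To make the accounting concrete, a worked example of the straddled-page rule used by the new pcpu_count_occupied_pages() (numbers made up, 4k pages assumed):

	area [1000, 9000) spans pages 0, 1 and 2; pages 0 and 2 are straddled

	    page 1 is fully covered and always counts
	    page 0 counts only if [0, 1000) is free      (off rounds down to 0)
	    page 2 counts only if [9000, 12288) is free  (end rounds up to 12288)

	both neighbours free -> the area occupies 3 pages
	both neighbours used -> the area occupies 1 page

A straddled page is attributed to an area only when the rest of that page is free, i.e. exactly when freeing the area would turn it back into an empty populated page, which is what pcpu_nr_empty_pop_pages needs to track.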
Signed-off-by: Tejun Heo diff --git a/mm/percpu-km.c b/mm/percpu-km.c index e662b49..10e3d0b 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - bitmap_fill(chunk->populated, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irq(&pcpu_lock); return chunk; diff --git a/mm/percpu.c b/mm/percpu.c index 546ced0..4f2d587 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -113,6 +113,7 @@ struct pcpu_chunk { void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -161,6 +162,12 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. + */ +static int pcpu_nr_empty_pop_pages; + /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); @@ -296,6 +303,38 @@ static void pcpu_mem_free(void *ptr, size_t size) } /** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. + */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + +/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -483,6 +522,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * @size: wanted size in bytes * @align: wanted align * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -498,7 +538,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * found. 
*/ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only) + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -587,6 +627,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -602,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -610,7 +652,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -641,6 +684,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? */ if (!(p[1] & 1)) to_free++; @@ -696,6 +741,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. 
* @@ -772,6 +861,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = !(gfp & GFP_KERNEL); + int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -812,7 +902,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -845,7 +936,8 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } @@ -899,17 +991,20 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1019,7 +1114,9 @@ static void pcpu_reclaim(struct work_struct *work) pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { pcpu_depopulate_chunk(chunk, rs, re); - bitmap_clear(chunk->populated, rs, re - rs); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); } @@ -1041,7 +1138,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -1055,7 +1152,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -1459,6 +1559,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1488,6 +1589,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1498,6 +1600,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ -- cgit v0.10.2 From fe6bd8c3d28357174587c4fe895d10b00321b692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: rename pcpu_reclaim_work to pcpu_balance_work pcpu_reclaim_work will also be used to populate chunks asynchronously. 
Rename it to pcpu_balance_work in preparation. pcpu_reclaim() is renamed to pcpu_balance_workfn() and some of its local variables are renamed too. This is pure rename. Signed-off-by: Tejun Heo diff --git a/mm/percpu.c b/mm/percpu.c index 4f2d587..28a8305 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -168,9 +168,9 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +/* balance work is used to populate or destroy chunks asynchronously */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_addr_in_first_chunk(void *addr) { @@ -1080,36 +1080,33 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - reclaim fully free chunks, workqueue function * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { + list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { @@ -1163,7 +1160,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + schedule_work(&pcpu_balance_work); break; } } -- cgit v0.10.2 From 1a4d76076cda69b0abf15463a8cebc172406da25 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implement asynchronous chunk population The percpu allocator now supports atomic allocations by only allocating from already populated areas but the mechanism to ensure that there's adequate amount of populated areas was missing. This patch expands pcpu_balance_work so that in addition to freeing excess free chunks it also populates chunks to maintain an adequate level of populated areas. pcpu_alloc() schedules pcpu_balance_work if the amount of free populated areas is too low or after an atomic allocation failure. * PERPCU_DYNAMIC_RESERVE is increased by two pages to account for PCPU_EMPTY_POP_PAGES_LOW. * pcpu_async_enabled is added to gate both async jobs - chunk->map_extend_work and pcpu_balance_work - so that we don't end up scheduling them while the needed subsystems aren't up yet. 
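To make the watermark scheme above concrete, here is a minimal standalone C sketch of the same pattern, not kernel code: the counter, the gate flag and the names (EMPTY_POP_LOW/HIGH, async_enabled, balance_pending, schedule_balance) are illustrative stand-ins for the real PCPU_* symbols and schedule_work() call, used only to show when balance work gets kicked.

/*
 * Userspace model of the low/high watermark refill described above.
 * Illustrative only; all names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define EMPTY_POP_LOW	2	/* refill trigger, like PCPU_EMPTY_POP_PAGES_LOW */
#define EMPTY_POP_HIGH	4	/* refill target, like PCPU_EMPTY_POP_PAGES_HIGH */

static int nr_empty_pop_pages = EMPTY_POP_HIGH;
static bool async_enabled;	/* false until "subsystems" are up */
static bool balance_pending;	/* stands in for schedule_work() */

static void schedule_balance(void)
{
	if (async_enabled)	/* gate async work during early boot */
		balance_pending = true;
}

static void balance_workfn(void)
{
	/* top up to the high watermark, like the populate loop */
	while (nr_empty_pop_pages < EMPTY_POP_HIGH)
		nr_empty_pop_pages++;	/* "populate" one page */
	balance_pending = false;
}

static bool atomic_alloc(int pages)
{
	if (nr_empty_pop_pages < pages) {
		schedule_balance();	/* alloc failure also kicks balance */
		return false;
	}
	nr_empty_pop_pages -= pages;
	if (nr_empty_pop_pages < EMPTY_POP_LOW)
		schedule_balance();	/* dropped below the low watermark */
	return true;
}

int main(void)
{
	async_enabled = true;	/* analogous to the late initcall enabling async */

	atomic_alloc(3);	/* leaves 1 page, below LOW -> schedule */
	if (balance_pending)
		balance_workfn();
	printf("empty populated pages: %d\n", nr_empty_pop_pages);	/* 4 */
	return 0;
}

The point of the sketch is the two trigger sites: dropping below the low watermark and an outright atomic failure both schedule the same deferred refill, and nothing is scheduled before the gate flag is set.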
Signed-off-by: Tejun Heo diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d1b416d..a3aa63e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,9 +48,9 @@ * intelligent way to determine this would be nice. */ #if BITS_PER_LONG > 32 -#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#define PERCPU_DYNAMIC_RESERVE (28 << 10) #else -#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#define PERCPU_DYNAMIC_RESERVE (20 << 10) #endif extern void *pcpu_base_addr; diff --git a/mm/percpu.c b/mm/percpu.c index 28a8305..867efd3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -78,6 +78,8 @@ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* balance work is used to populate or destroy chunks asynchronously */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) schedule_work(&chunk->map_extend_work); } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; @@ -1005,6 +1021,9 @@ area_found: if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages -= occ_pages; + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1023,6 +1042,11 @@ fail: if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } @@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. @@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work) LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. 
+ */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_balance_work); + pcpu_schedule_balance_work(); break; } } @@ -2187,3 +2284,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); -- cgit v0.10.2 From ebd8fef304f99da84d4a52ad056f6137ac9652d4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: make percpu_counters_lock irq-safe percpu_counter is scheduled to grow @gfp support to allow atomic initialization. This patch makes percpu_counters_lock irq-safe so that it can be safely used from atomic contexts. 
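The idiom behind this conversion, shown below as a hedged illustration rather than code from the patch: once a lock may be taken from a context that runs with interrupts disabled, every acquisition has to save and restore IRQ state, otherwise an interrupt arriving on the same CPU could spin forever on a lock that CPU already holds. The lock and function names here are made up for the example.

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(example_lock);		/* hypothetical lock */

static void example_add(struct list_head *entry, struct list_head *head)
{
	unsigned long flags;

	/* was: spin_lock(&example_lock); */
	spin_lock_irqsave(&example_lock, flags);
	list_add(entry, head);
	spin_unlock_irqrestore(&example_lock, flags);
}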
Signed-off-by: Tejun Heo diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7dd33577..3fde782 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -115,6 +115,8 @@ EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, struct lock_class_key *key) { + unsigned long flags __maybe_unused; + raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; @@ -126,9 +128,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_add(&fbc->list, &percpu_counters); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } @@ -136,15 +138,17 @@ EXPORT_SYMBOL(__percpu_counter_init); void percpu_counter_destroy(struct percpu_counter *fbc) { + unsigned long flags __maybe_unused; + if (!fbc->counters) return; debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_del(&fbc->list); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc->counters); fbc->counters = NULL; @@ -173,7 +177,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; cpu = (unsigned long)hcpu; - spin_lock(&percpu_counters_lock); + spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; unsigned long flags; @@ -184,7 +188,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, *pcount = 0; raw_spin_unlock_irqrestore(&fbc->lock, flags); } - spin_unlock(&percpu_counters_lock); + spin_unlock_irq(&percpu_counters_lock); #endif return NOTIFY_OK; } -- cgit v0.10.2 From 908c7f1949cb7cc6e92ba8f18f2998e87e265b8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: add @gfp to percpu_counter_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_counter_init() so that !GFP_KERNEL allocation masks can be used with percpu_counters too. We could have left percpu_counter_init() alone and added percpu_counter_init_gfp(); however, the number of users isn't that high and introducing _gfp variants to all percpu data structures would be quite ugly, so let's just do the conversion. This is the one with the most users. Other percpu data structures are a lot easier to convert. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Jan Kara Acked-by: "David S. 
Miller" Cc: x86@kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andrew Morton diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9314678..5bd53f2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4534,7 +4534,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto nomem; register_shrinker(&mmu_shrinker); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9..61dae01 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1180,7 +1180,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2193,13 +2193,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f..94ec71e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3493,7 +3493,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b88edc0..170dc41 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5..eba021b 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git 
a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad..e25ca8f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3891,7 +3891,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -4105,17 +4106,20 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); + ext4_count_dirs(sb), GFP_KERNEL); if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd3..0bab12b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f2d0eee..8b663b2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2725,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/super.c b/fs/super.c index b9a214d..1b83610 100644 --- a/fs/super.c +++ b/fs/super.c @@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d5dd465..50e5009 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SMP @@ -26,14 +27,14 @@ struct percpu_counter { extern int percpu_counter_batch; -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key); -#define percpu_counter_init(fbc, value) \ +#define percpu_counter_init(fbc, value, gfp) \ ({ \ static struct lock_class_key __key; \ \ - __percpu_counter_init(fbc, value, &__key); \ + __percpu_counter_init(fbc, value, gfp, &__key); \ }) void percpu_counter_destroy(struct percpu_counter *fbc); @@ -89,7 +90,8 @@ struct percpu_counter { s64 count; }; 
-static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, + gfp_t gfp) { fbc->count = amount; return 0; diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 2f26dfb..1f99a1d 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val) static inline int dst_entries_init(struct dst_ops *dst) { - return percpu_counter_init(&dst->pcpuc_entries, 0); + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 65a8855..8d17655 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) static inline void init_frag_mem_limit(struct netns_frags *nf) { - percpu_counter_init(&nf->mem, 0); + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); } static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index ebf3bac..b9d026b 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -40,7 +40,7 @@ int fprop_global_init(struct fprop_global *p) p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... */ - err = percpu_counter_init(&p->events, 1); + err = percpu_counter_init(&p->events, 1, GFP_KERNEL); if (err) return err; seqcount_init(&p->sequence); @@ -172,7 +172,7 @@ int fprop_local_init_percpu(struct fprop_local_percpu *pl) { int err; - err = percpu_counter_init(&pl->events, 0); + err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); if (err) return err; pl->period = 0; diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 3fde782..48144cd 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -112,7 +112,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) } EXPORT_SYMBOL(__percpu_counter_sum); -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { unsigned long flags __maybe_unused; @@ -120,7 +120,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; - fbc->counters = alloc_percpu(s32); + fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; diff --git a/lib/proportions.c b/lib/proportions.c index 05df848..ca95f8d 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0); + err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0); + err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -193,7 +193,7 @@ int prop_local_init_percpu(struct prop_local_percpu *pl) raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0); + return percpu_counter_init(&pl->events, 0, GFP_KERNEL); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbb..f19a818 100644 --- a/mm/backing-dev.c +++ 
b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4..d7ec93e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3196,7 +3196,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index a881d96..bd1808e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb22..d4bc55d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2993,7 +2993,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index de2c1e7..e421edd 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1115,7 +1115,7 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0); + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; rc = -ENOBUFS; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a..d59c260 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3188,8 +3188,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 3af5226..1d19135 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) res_parent = &parent_cg->memory_allocated; res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834..f00a85a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1341,7 +1341,7 @@ static __init int sctp_init(void) if (!sctp_chunk_cachep) goto err_chunk_cachep; - status = percpu_counter_init(&sctp_sockets_allocated, 0); + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; -- cgit v0.10.2 From 20ae00792c6f1f18fc4fc5965445a145df92827e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: proportions: add @gfp to init functions Percpu allocator now supports allocation mask. Add @gfp to [flex_]proportions init functions so that !GFP_KERNEL allocation masks can be used with them too. 
This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Peter Zijlstra diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 4ebc49f..0d348e0 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * When maximum proportion of some event type is specified, this is the @@ -32,7 +33,7 @@ struct fprop_global { seqcount_t sequence; }; -int fprop_global_init(struct fprop_global *p); +int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); @@ -79,7 +80,7 @@ struct fprop_local_percpu { raw_spinlock_t lock; /* Protect period and numerator */ }; -int fprop_local_init_percpu(struct fprop_local_percpu *pl); +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 26a8a4e..00e8e8f 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -12,6 +12,7 @@ #include #include #include +#include struct prop_global { /* @@ -40,7 +41,7 @@ struct prop_descriptor { struct mutex mutex; /* serialize the prop_global switch */ }; -int prop_descriptor_init(struct prop_descriptor *pd, int shift); +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); void prop_change_shift(struct prop_descriptor *pd, int new_shift); /* @@ -61,7 +62,7 @@ struct prop_local_percpu { raw_spinlock_t lock; /* protect the snapshot state */ }; -int prop_local_init_percpu(struct prop_local_percpu *pl); +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); void prop_local_destroy_percpu(struct prop_local_percpu *pl); void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index b9d026b..8f25652 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -34,13 +34,13 @@ */ #include -int fprop_global_init(struct fprop_global *p) +int fprop_global_init(struct fprop_global *p, gfp_t gfp) { int err; p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... 
*/ - err = percpu_counter_init(&p->events, 1, GFP_KERNEL); + err = percpu_counter_init(&p->events, 1, gfp); if (err) return err; seqcount_init(&p->sequence); @@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p, */ #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int fprop_local_init_percpu(struct fprop_local_percpu *pl) +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) { int err; - err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); + err = percpu_counter_init(&pl->events, 0, gfp); if (err) return err; pl->period = 0; diff --git a/lib/proportions.c b/lib/proportions.c index ca95f8d..6f72429 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -73,7 +73,7 @@ #include #include -int prop_descriptor_init(struct prop_descriptor *pd, int shift) +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) { int err; @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int prop_local_init_percpu(struct prop_local_percpu *pl) +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) { raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0, GFP_KERNEL); + return percpu_counter_init(&pl->events, 0, gfp); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f19a818..64ec49d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef..5085994 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** -- cgit v0.10.2 From a34375ef9e65340a138fc0be287de5c940d260fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: percpu-refcount: add @gfp to percpu_ref_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_ref_init() so that !GFP_KERNEL allocation masks can be used with percpu_refs too. This patch doesn't make any functional difference. v2: blk-mq conversion was missing. Updated. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Benjamin LaHaise Cc: Li Zefan Cc: Nicholas A. 
Bellinger Cc: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1..702df07 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1776,7 +1776,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, + GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae6..4ab6da3 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,8 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index bd7ec2c..93fbcc0f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -666,10 +666,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf23..ee83251 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -49,6 +49,7 @@ #include #include #include +#include struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); @@ -66,7 +67,7 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release); + percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788..589b4d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1628,7 +1628,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out; @@ -4487,7 +4487,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); if (err) goto err_free_css; @@ -4555,7 +4555,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a334..ff99032 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -40,6 +40,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; * analagous to atomic_set(ref, 1). 
@@ -47,11 +48,12 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ -int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, + gfp_t gfp) { atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; -- cgit v0.10.2 From 23cb8981ed929b4dd48141401cd0fd31e0fa4ed0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Sep 2014 08:02:45 +0900 Subject: percpu: fix locking regression in the failure path of pcpu_alloc() While updating locking, b38d08f3181c ("percpu: restructure locking") broke pcpu_create_chunk() creation path in pcpu_alloc(). It returns without releasing pcpu_alloc_mutex. Fix it. Signed-off-by: Tejun Heo Reported-by: Julia Lawall diff --git a/mm/percpu.c b/mm/percpu.c index 867efd3..af3dd27 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -974,6 +974,7 @@ restart: if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } -- cgit v0.10.2 From 4843c3320c3d23ab4ecf520f5eaf485aff8c7252 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 20 Sep 2014 01:27:24 -0400 Subject: percpu-refcount: improve WARN messages percpu_ref's WARN messages can be a lot more helpful by indicating who's the culprit. Make them report the release function that the offending percpu-refcount is associated with. This should make it a lot easier to track down the reported invalid refcnting operations. Signed-off-by: Tejun Heo Cc: Kent Overstreet diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index ff99032..70d28c9 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -145,8 +145,9 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); - WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", - atomic_read(&ref->count)); + WARN_ONCE(atomic_read(&ref->count) <= 0, + "percpu ref (%pf) <= 0 (%i) after killed", + ref->release, atomic_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) @@ -178,7 +179,8 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once!\n"); + "percpu_ref_kill() called more than once on %pf!", + ref->release); ref->pcpu_count_ptr |= PCPU_REF_DEAD; ref->confirm_kill = confirm_kill; -- cgit v0.10.2 From e625305b390790717cf2cccf61efb81299647028 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 20 Sep 2014 01:27:25 -0400 Subject: percpu-refcount: make percpu_ref based on longs instead of ints percpu_ref is currently based on ints and the number of refs it can cover is (1 << 31). This makes it impossible to use a percpu_ref to count memory objects or pages on 64bit machines as it may overflow. This forces those users to somehow aggregate the references before contributing to the percpu_ref which is often cumbersome and sometimes challenging to get the same level of performance as using the percpu_ref directly. 
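As a quick back-of-the-envelope check of the limit being hit (the numbers are illustrative, not taken from the patch: 16 TiB of RAM with 4 KiB pages already needs more than 2^31 counts):

#include <stdio.h>

int main(void)
{
	unsigned long long ram = 16ULL << 40;		/* 16 TiB of memory */
	unsigned long long pages = ram >> 12;		/* 4 KiB pages */

	printf("pages      = %llu\n", pages);			/* 4294967296 */
	printf("int limit  = %llu\n", (1ULL << 31) - 1);	/* 2147483647 */
	printf("long limit = %llu\n", ~0ULL >> 1);		/* 9223372036854775807 */
	return 0;
}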
While using ints for the percpu counters makes them pack tighter on 64bit machines, the possible gain from using ints instead of longs is extremely small compared to the overall gain from per-cpu operation. This patch makes percpu_ref based on longs so that it can be used to directly count memory objects or pages. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Johannes Weiner diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index ee83251..5df6784 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -55,7 +55,7 @@ struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); struct percpu_ref { - atomic_t count; + atomic_long_t count; /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. @@ -97,7 +97,7 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) * branches as it can't assume that @ref->pcpu_count is not NULL. */ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, - unsigned __percpu **pcpu_countp) + unsigned long __percpu **pcpu_countp) { unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); @@ -107,7 +107,7 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) return false; - *pcpu_countp = (unsigned __percpu *)pcpu_ptr; + *pcpu_countp = (unsigned long __percpu *)pcpu_ptr; return true; } @@ -119,14 +119,14 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, */ static inline void percpu_ref_get(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; rcu_read_lock_sched(); if (__pcpu_ref_alive(ref, &pcpu_count)) this_cpu_inc(*pcpu_count); else - atomic_inc(&ref->count); + atomic_long_inc(&ref->count); rcu_read_unlock_sched(); } @@ -142,7 +142,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; int ret = false; rcu_read_lock_sched(); @@ -151,7 +151,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) this_cpu_inc(*pcpu_count); ret = true; } else { - ret = atomic_inc_not_zero(&ref->count); + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); @@ -175,7 +175,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; int ret = false; rcu_read_lock_sched(); @@ -199,13 +199,13 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) */ static inline void percpu_ref_put(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; rcu_read_lock_sched(); if (__pcpu_ref_alive(ref, &pcpu_count)) this_cpu_dec(*pcpu_count); - else if (unlikely(atomic_dec_and_test(&ref->count))) + else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); rcu_read_unlock_sched(); @@ -219,11 +219,11 @@ static inline void percpu_ref_put(struct percpu_ref *ref) */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *pcpu_count; if (__pcpu_ref_alive(ref, &pcpu_count)) return false; - return !atomic_read(&ref->count); + return !atomic_long_read(&ref->count); } #endif diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 70d28c9..559ee0b 100644 --- a/lib/percpu-refcount.c +++ 
b/lib/percpu-refcount.c @@ -25,15 +25,15 @@ * works. * * Converting to non percpu mode is done with some RCUish stuff in - * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t - * can't hit 0 before we've added up all the percpu refs. + * percpu_ref_kill. Additionally, we need a bias value so that the + * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ -#define PCPU_COUNT_BIAS (1U << 31) +#define PCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) -static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) +static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) { - return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + return (unsigned long __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); } /** @@ -43,7 +43,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analagous to atomic_set(ref, 1). + * analagous to atomic_long_set(ref, 1). * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). @@ -51,9 +51,9 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned long, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; @@ -75,13 +75,13 @@ EXPORT_SYMBOL_GPL(percpu_ref_init); */ void percpu_ref_reinit(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); int cpu; BUG_ON(!pcpu_count); WARN_ON(!percpu_ref_is_zero(ref)); - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); /* * Restore per-cpu operation. 
smp_store_release() is paired with @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); */ void percpu_ref_exit(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); if (pcpu_count) { free_percpu(pcpu_count); @@ -121,14 +121,15 @@ EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); - unsigned count = 0; + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(pcpu_count, cpu); - pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); + pr_debug("global %ld pcpu %ld", + atomic_long_read(&ref->count), (long)count); /* * It's crucial that we sum the percpu counters _before_ adding the sum @@ -143,11 +144,11 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * time is equivalent and saves us atomic operations: */ - atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); + atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); - WARN_ONCE(atomic_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%i) after killed", - ref->release, atomic_read(&ref->count)); + WARN_ONCE(atomic_long_read(&ref->count) <= 0, + "percpu ref (%pf) <= 0 (%ld) after killed", + ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) -- cgit v0.10.2 From bb2e226b3bef596dd56be97df655d857b4603923 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 21 Sep 2014 15:04:53 -0700 Subject: Revert "percpu: free percpu allocation info for uniprocessor system" This reverts commit 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system"). The commit causes a hang with a crisv32 image. This may be an architecture problem, but at least for now the revert is necessary to be able to boot a crisv32 image. Cc: Tejun Heo Cc: Honggang Li Signed-off-by: Guenter Roeck Signed-off-by: Tejun Heo Fixes: 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system") Cc: stable@vger.kernel.org # Please don't apply 3189eddbcafc diff --git a/mm/percpu.c b/mm/percpu.c index af3dd27..e10f9f7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2250,8 +2250,6 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); - - pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ -- cgit v0.10.2 From 9eca80461a45177e456219a9cd944c27675d6512 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:07:33 -0400 Subject: Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe" This reverts commit 0a30288da1aec914e158c2d7a3482a85f632750f, which was a temporary fix for SCSI blk-mq stall issue. The following patches will fix the issue properly by introducing atomic mode to percpu_ref. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig diff --git a/block/blk-mq.c b/block/blk-mq.c index 255d79c..44a78ae 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (freeze) { - /* - * XXX: Temporary kludge to work around SCSI blk-mq stall. - * SCSI synchronously creates and destroys many queues - * back-to-back during probe leading to lengthy stalls. 
- * This will be fixed by keeping ->mq_usage_counter in - * atomic mode until genhd registration, but, for now, - * let's work around using expedited synchronization. - */ - __percpu_ref_kill_expedited(&q->mq_usage_counter); - + percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_queues(q, false); } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 11b38ce..5df6784 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -72,7 +72,6 @@ void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); -void __percpu_ref_kill_expedited(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index c6c31e2..559ee0b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -189,19 +189,3 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); - -/* - * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by - * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 - * devel cycle. Do not use anywhere else. - */ -void __percpu_ref_kill_expedited(struct percpu_ref *ref) -{ - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once on %pf!", - ref->release); - - ref->pcpu_count_ptr |= PCPU_REF_DEAD; - synchronize_sched_expedited(); - percpu_ref_kill_rcu(&ref->rcu); -} -- cgit v0.10.2 From a2237370194484ee6aeeff04b617e4b14d178966 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: relocate percpu_ref_reinit() percpu_ref is gonna go through restructuring. Move percpu_ref_reinit() after percpu_ref_kill_and_confirm(). This will make later changes easier to follow and result in cleaner organization. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 5df6784..f015f13 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -68,10 +68,10 @@ struct percpu_ref { int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp); -void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 559ee0b..070dab5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -63,41 +63,6 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, EXPORT_SYMBOL_GPL(percpu_ref_init); /** - * percpu_ref_reinit - re-initialize a percpu refcount - * @ref: perpcu_ref to re-initialize - * - * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully, killed - * and reached 0 but not exited. - * - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while - * this function is in progress. 
- */ -void percpu_ref_reinit(struct percpu_ref *ref) -{ - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); - int cpu; - - BUG_ON(!pcpu_count); - WARN_ON(!percpu_ref_is_zero(ref)); - - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); - - /* - * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following PCPU_REF_DEAD clearing. - */ - for_each_possible_cpu(cpu) - *per_cpu_ptr(pcpu_count, cpu) = 0; - - smp_store_release(&ref->pcpu_count_ptr, - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); -} -EXPORT_SYMBOL_GPL(percpu_ref_reinit); - -/** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * @@ -189,3 +154,38 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/** + * percpu_ref_reinit - re-initialize a percpu refcount + * @ref: perpcu_ref to re-initialize + * + * Re-initialize @ref so that it's in the same state as when it finished + * percpu_ref_init(). @ref must have been initialized successfully, killed + * and reached 0 but not exited. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. + */ +void percpu_ref_reinit(struct percpu_ref *ref) +{ + unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + int cpu; + + BUG_ON(!pcpu_count); + WARN_ON(!percpu_ref_is_zero(ref)); + + atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + + /* + * Restore per-cpu operation. smp_store_release() is paired with + * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees + * that the zeroing is visible to all percpu accesses which can see + * the following PCPU_REF_DEAD clearing. + */ + for_each_possible_cpu(cpu) + *per_cpu_ptr(pcpu_count, cpu) = 0; + + smp_store_release(&ref->pcpu_count_ptr, + ref->pcpu_count_ptr & ~PCPU_REF_DEAD); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 6251f9976af7656b6970a8820153f356430f5de2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: minor code and comment updates * Some comments became stale. Updated. * percpu_ref_tryget() unnecessarily initializes @ret. Removed. * A blank line removed from percpu_ref_kill_rcu(). * Explicit function name in a WARN format string replaced with __func__. * WARN_ON() in percpu_ref_reinit() converted to WARN_ON_ONCE(). Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index f015f13..d44b027 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -115,8 +115,10 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * - * Analagous to atomic_inc(). - */ + * Analagous to atomic_long_inc(). + * + * This function is safe to call as long as @ref is between init and exit. + */ static inline void percpu_ref_get(struct percpu_ref *ref) { unsigned long __percpu *pcpu_count; @@ -138,12 +140,12 @@ static inline void percpu_ref_get(struct percpu_ref *ref) * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. 
*/ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { unsigned long __percpu *pcpu_count; - int ret = false; + int ret; rcu_read_lock_sched(); @@ -166,12 +168,13 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * - * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget - * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be - * used. After the confirm_kill callback is invoked, it's guaranteed that - * no new reference will be given out by percpu_ref_tryget(). + * Completion of percpu_ref_kill() in itself doesn't guarantee that this + * function will fail. For such guarantee, percpu_ref_kill_and_confirm() + * should be used. After the confirm_kill callback is invoked, it's + * guaranteed that no new reference will be given out by + * percpu_ref_tryget_live(). * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { @@ -196,6 +199,8 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) + * + * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { @@ -216,6 +221,8 @@ static inline void percpu_ref_put(struct percpu_ref *ref) * @ref: percpu_ref to test * * Returns %true if @ref reached zero. + * + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 070dab5..8ef3f5c 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -108,7 +108,6 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ - atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, @@ -120,8 +119,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) ref->confirm_kill(ref); /* - * Now we're in single atomic_t mode with a consistent refcount, so it's - * safe to drop our initial ref: + * Now we're in single atomic_long_t mode with a consistent + * refcount, so it's safe to drop our initial ref: */ percpu_ref_put(ref); } @@ -134,8 +133,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs - all further - * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() - * for more details. + * invocations of percpu_ref_tryget_live() will fail. See + * percpu_ref_tryget_live() for more details. 
* * Due to the way percpu_ref is implemented, @confirm_kill will be called * after at least one full RCU grace period has passed but this is an @@ -145,8 +144,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once on %pf!", - ref->release); + "%s called more than once on %pf!", __func__, ref->release); ref->pcpu_count_ptr |= PCPU_REF_DEAD; ref->confirm_kill = confirm_kill; @@ -172,7 +170,7 @@ void percpu_ref_reinit(struct percpu_ref *ref) int cpu; BUG_ON(!pcpu_count); - WARN_ON(!percpu_ref_is_zero(ref)); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); -- cgit v0.10.2 From eecc16ba9a49b05dd847a317af166a6728eb56ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: replace pcpu_ prefix with percpu_ percpu_ref uses pcpu_ prefix for internal stuff and percpu_ for externally visible ones. This is the same convention used in the percpu allocator implementation. It works fine there but percpu_ref doesn't have too much internal-only stuff and scattered usages of pcpu_ prefix are confusing than helpful. This patch replaces all pcpu_ prefixes with percpu_. This is pure rename and there's no functional change. Note that PCPU_REF_DEAD is renamed to __PERCPU_REF_DEAD to signify that the flag is internal. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index d44b027..3d463a3 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -13,7 +13,7 @@ * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see - * percpu_ref_kill()/PCPU_COUNT_BIAS. + * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() @@ -60,7 +60,7 @@ struct percpu_ref { * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ - unsigned long pcpu_count_ptr; + unsigned long percpu_count_ptr; percpu_ref_func_t *release; percpu_ref_func_t *confirm_kill; struct rcu_head rcu; @@ -88,26 +88,26 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) return percpu_ref_kill_and_confirm(ref, NULL); } -#define PCPU_REF_DEAD 1 +#define __PERCPU_REF_DEAD 1 /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional - * branches as it can't assume that @ref->pcpu_count is not NULL. + * branches as it can't assume that @ref->percpu_count is not NULL. 
*/ -static inline bool __pcpu_ref_alive(struct percpu_ref *ref, - unsigned long __percpu **pcpu_countp) +static inline bool __percpu_ref_alive(struct percpu_ref *ref, + unsigned long __percpu **percpu_countp) { - unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); + unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_DEAD)) return false; - *pcpu_countp = (unsigned long __percpu *)pcpu_ptr; + *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } @@ -121,12 +121,12 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, */ static inline void percpu_ref_get(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) + this_cpu_inc(*percpu_count); else atomic_long_inc(&ref->count); @@ -144,13 +144,13 @@ static inline void percpu_ref_get(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; int ret; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; } else { ret = atomic_long_inc_not_zero(&ref->count); @@ -178,13 +178,13 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; int ret = false; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; } @@ -204,12 +204,12 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) */ static inline void percpu_ref_put(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_dec(*pcpu_count); + if (__percpu_ref_alive(ref, &percpu_count)) + this_cpu_dec(*percpu_count); else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); @@ -226,9 +226,9 @@ static inline void percpu_ref_put(struct percpu_ref *ref) */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count; + unsigned long __percpu *percpu_count; - if (__pcpu_ref_alive(ref, &pcpu_count)) + if (__percpu_ref_alive(ref, &percpu_count)) return false; return !atomic_long_read(&ref->count); } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 8ef3f5c..5aea6b7 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -11,8 +11,8 @@ * percpu counters will all sum to the correct value * * (More precisely: because moduler arithmatic is commutative the sum of all the - * pcpu_count vars will be equal to what it would have been if all the gets and - * puts were done to a single integer, even if some of the percpu integers + * percpu_count vars will be equal to what it would have been if all the gets + * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. 
We can't detect @@ -29,11 +29,12 @@ * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ -#define PCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) -static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) +static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { - return (unsigned long __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + return (unsigned long __percpu *) + (ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); } /** @@ -51,10 +52,11 @@ static unsigned long __percpu *pcpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned long, gfp); - if (!ref->pcpu_count_ptr) + ref->percpu_count_ptr = + (unsigned long)alloc_percpu_gfp(unsigned long, gfp); + if (!ref->percpu_count_ptr) return -ENOMEM; ref->release = release; @@ -74,11 +76,11 @@ EXPORT_SYMBOL_GPL(percpu_ref_init); */ void percpu_ref_exit(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); - if (pcpu_count) { - free_percpu(pcpu_count); - ref->pcpu_count_ptr = PCPU_REF_DEAD; + if (percpu_count) { + free_percpu(percpu_count); + ref->percpu_count_ptr = __PERCPU_REF_DEAD; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -86,14 +88,14 @@ EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) - count += *per_cpu_ptr(pcpu_count, cpu); + count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %ld pcpu %ld", + pr_debug("global %ld percpu %ld", atomic_long_read(&ref->count), (long)count); /* @@ -108,7 +110,7 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ - atomic_long_add((long)count - PCPU_COUNT_BIAS, &ref->count); + atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, "percpu ref (%pf) <= 0 (%ld) after killed", @@ -143,10 +145,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); - ref->pcpu_count_ptr |= PCPU_REF_DEAD; + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; ref->confirm_kill = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); @@ -166,24 +168,24 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { - unsigned long __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; - BUG_ON(!pcpu_count); + BUG_ON(!percpu_count); WARN_ON_ONCE(!percpu_ref_is_zero(ref)); - atomic_long_set(&ref->count, 1 + PCPU_COUNT_BIAS); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); /* * Restore per-cpu operation. 
smp_store_release() is paired with - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following PCPU_REF_DEAD clearing. + * smp_read_barrier_depends() in __percpu_ref_alive() and + * guarantees that the zeroing is visible to all percpu accesses + * which can see the following __PERCPU_REF_DEAD clearing. */ for_each_possible_cpu(cpu) - *per_cpu_ptr(pcpu_count, cpu) = 0; + *per_cpu_ptr(percpu_count, cpu) = 0; - smp_store_release(&ref->pcpu_count_ptr, - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + smp_store_release(&ref->percpu_count_ptr, + ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 9e804d1f58da1eca079f796347c1cf1d1df564e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:48 -0400 Subject: percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch percpu_ref will be restructured so that percpu/atomic mode switching and reference killing are dedoupled. In preparation, do the following renames. * percpu_ref->confirm_kill -> percpu_ref->confirm_switch * __PERCPU_REF_DEAD -> __PERCPU_REF_ATOMIC * __percpu_ref_alive() -> __ref_is_percpu() This patch is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3d463a3..910e5f7 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -54,6 +54,11 @@ struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); +/* flags set in the lower bits of percpu_ref->percpu_count_ptr */ +enum { + __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ +}; + struct percpu_ref { atomic_long_t count; /* @@ -62,7 +67,7 @@ struct percpu_ref { */ unsigned long percpu_count_ptr; percpu_ref_func_t *release; - percpu_ref_func_t *confirm_kill; + percpu_ref_func_t *confirm_switch; struct rcu_head rcu; }; @@ -88,23 +93,21 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) return percpu_ref_kill_and_confirm(ref, NULL); } -#define __PERCPU_REF_DEAD 1 - /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. 
*/ -static inline bool __percpu_ref_alive(struct percpu_ref *ref, - unsigned long __percpu **percpu_countp) +static inline bool __ref_is_percpu(struct percpu_ref *ref, + unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; @@ -125,7 +128,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) this_cpu_inc(*percpu_count); else atomic_long_inc(&ref->count); @@ -149,7 +152,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) { + if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; } else { @@ -183,7 +186,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) { + if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; } @@ -208,7 +211,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref) rcu_read_lock_sched(); - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) this_cpu_dec(*percpu_count); else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); @@ -228,7 +231,7 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - if (__percpu_ref_alive(ref, &percpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) return false; return !atomic_long_read(&ref->count); } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 5aea6b7..7aef590 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -34,7 +34,7 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) - (ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } /** @@ -80,7 +80,7 @@ void percpu_ref_exit(struct percpu_ref *ref) if (percpu_count) { free_percpu(percpu_count); - ref->percpu_count_ptr = __PERCPU_REF_DEAD; + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -117,8 +117,8 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out kill confirmation */ - if (ref->confirm_kill) - ref->confirm_kill(ref); + if (ref->confirm_switch) + ref->confirm_switch(ref); /* * Now we're in single atomic_long_t mode with a consistent @@ -145,11 +145,11 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC, "%s called more than once on %pf!", __func__, ref->release); - ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - ref->confirm_kill = confirm_kill; + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + ref->confirm_switch = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } @@ -178,14 +178,14 @@ void percpu_ref_reinit(struct percpu_ref *ref) /* * Restore per-cpu operation. 
smp_store_release() is paired with - * smp_read_barrier_depends() in __percpu_ref_alive() and - * guarantees that the zeroing is visible to all percpu accesses - * which can see the following __PERCPU_REF_DEAD clearing. + * smp_read_barrier_depends() in __ref_is_percpu() and guarantees + * that the zeroing is visible to all percpu accesses which can see + * the following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_DEAD); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 27344a9017cdaff82a167827da3001a0918afdc3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: add PCPU_REF_DEAD percpu_ref will be restructured so that percpu/atomic mode switching and reference killing are dedoupled. In preparation, add PCPU_REF_DEAD and PCPU_REF_ATOMIC_DEAD which is OR of ATOMIC and DEAD. For now, ATOMIC and DEAD are changed together and all PCPU_REF_ATOMIC uses are converted to PCPU_REF_ATOMIC_DEAD without causing any behavior changes. percpu_ref_init() now specifies an explicit alignment when allocating the percpu counters so that the pointer has enough unused low bits to accomodate the flags. Note that one flag was fine as min alignment for percpu memory is 2 bytes but two flags are already too many for the natural alignment of unsigned longs on archs like cris and m68k. v2: The original patch had BUILD_BUG_ON() which triggers if unsigned long's alignment isn't enough to accomodate the flags, which triggered on cris and m64k. percpu_ref_init() updated to specify the required alignment explicitly. Reported by Fengguang. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: kbuild test robot diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 910e5f7..bd9483d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -57,6 +57,10 @@ typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ + __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ + __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, + + __PERCPU_REF_FLAG_BITS = 2, }; struct percpu_ref { @@ -107,7 +111,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7aef590..e2ff19f 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -34,7 +34,7 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) - (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** @@ -52,10 +52,13 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp) { + size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, + __alignof__(unsigned long)); + atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); - ref->percpu_count_ptr = - (unsigned long)alloc_percpu_gfp(unsigned long, gfp); + 
ref->percpu_count_ptr = (unsigned long) + __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; @@ -80,7 +83,7 @@ void percpu_ref_exit(struct percpu_ref *ref) if (percpu_count) { free_percpu(percpu_count); - ref->percpu_count_ptr = __PERCPU_REF_ATOMIC; + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); @@ -145,10 +148,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC, + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC_DEAD, "%s called more than once on %pf!", __func__, ref->release); - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC_DEAD; ref->confirm_switch = confirm_kill; call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); @@ -180,12 +183,12 @@ void percpu_ref_reinit(struct percpu_ref *ref) * Restore per-cpu operation. smp_store_release() is paired with * smp_read_barrier_depends() in __ref_is_percpu() and guarantees * that the zeroing is visible to all percpu accesses which can see - * the following __PERCPU_REF_ATOMIC clearing. + * the following __PERCPU_REF_ATOMIC_DEAD clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 490c79a65708873228cf114cf00e32c204e4e907 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: decouple switching to atomic mode and killing percpu_ref has treated the dropping of the base reference and switching to atomic mode as an integral operation; however, there's nothing inherent tying the two together. The use cases for percpu_ref have been expanding continuously. While the current init/kill/reinit/exit model can cover a lot, the coupling of kill/reinit with atomic/percpu mode switching is turning out to be too restrictive for use cases where many percpu_refs are created and destroyed back-to-back with only some of them reaching extended operation. The coupling also makes implementing always-atomic debug mode difficult. This patch separates out atomic mode switching into percpu_ref_switch_to_atomic() and reimplements percpu_ref_kill_and_confirm() on top of it. * The handling of __PERCPU_REF_ATOMIC and __PERCPU_REF_DEAD is now differentiated. Among get/put operations, percpu_ref_tryget_live() is the only one which cares about DEAD. * percpu_ref_switch_to_atomic() can be called multiple times on the same ref. This means that multiple @confirm_switch may get queued up which we can't do reliably without extra memory area. This is handled by making the later invocation synchronously wait for the completion of the previous one. This isn't particularly desirable but such synchronous waits shouldn't happen in most cases. 
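For illustration, a minimal caller-side sketch of the new interface follows. The my_dev structure, my_dev_confirm_switch() and my_dev_quiesce() names are hypothetical and not part of the patch; this is just one plausible way a caller could consume the confirmation callback.

#include <linux/completion.h>
#include <linux/percpu-refcount.h>

struct my_dev {
        struct percpu_ref       ref;
        struct completion       switched;
};

static void my_dev_confirm_switch(struct percpu_ref *ref)
{
        struct my_dev *dev = container_of(ref, struct my_dev, ref);

        /* runs once @ref is seen as atomic on all CPUs; must not block */
        complete(&dev->switched);
}

static void my_dev_quiesce(struct my_dev *dev)
{
        init_completion(&dev->switched);
        percpu_ref_switch_to_atomic(&dev->ref, my_dev_confirm_switch);
        wait_for_completion(&dev->switched);
        /* gets/puts keep working; they now hit the atomic counter */
}

The ref stays alive throughout; dropping the base reference remains a separate percpu_ref_kill_and_confirm() call.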
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index bd9483d..d1252e1 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -78,9 +78,11 @@ struct percpu_ref { int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch); +void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); -void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref @@ -111,7 +113,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; @@ -193,6 +195,8 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; + } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index e2ff19f..6e0d143 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -1,6 +1,8 @@ #define pr_fmt(fmt) "%s: " fmt "\n", __func__ #include +#include +#include #include /* @@ -31,6 +33,8 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); + static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) @@ -88,7 +92,19 @@ void percpu_ref_exit(struct percpu_ref *ref) } EXPORT_SYMBOL_GPL(percpu_ref_exit); -static void percpu_ref_kill_rcu(struct rcu_head *rcu) +static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) +{ + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); + + ref->confirm_switch(ref); + ref->confirm_switch = NULL; + wake_up_all(&percpu_ref_switch_waitq); + + /* drop ref from percpu_ref_switch_to_atomic() */ + percpu_ref_put(ref); +} + +static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); unsigned long __percpu *percpu_count = percpu_count_ptr(ref); @@ -116,47 +132,79 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%ld) after killed", + "percpu ref (%pf) <= 0 (%ld) after switching to atomic", ref->release, atomic_long_read(&ref->count)); - /* @ref is viewed as dead on all CPUs, send out kill confirmation */ - if (ref->confirm_switch) - ref->confirm_switch(ref); + /* @ref is viewed as dead on all CPUs, send out switch confirmation */ + percpu_ref_call_confirm_rcu(rcu); +} - /* - * Now we're in single atomic_long_t mode with a consistent - * refcount, so it's safe to drop our initial ref: - */ - percpu_ref_put(ref); +static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) +{ +} + +static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + if 
(!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that + * switching is in progress. Use noop one if unspecified. + */ + WARN_ON_ONCE(ref->confirm_switch); + ref->confirm_switch = + confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); + } else if (confirm_switch) { + /* + * Somebody already set ATOMIC. Switching may still be in + * progress. @confirm_switch must be invoked after the + * switching is complete and a full sched RCU grace period + * has passed. Wait synchronously for the previous + * switching and schedule @confirm_switch invocation. + */ + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + ref->confirm_switch = confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + } } /** - * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation - * @ref: percpu_ref to kill - * @confirm_kill: optional confirmation callback + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback * - * Equivalent to percpu_ref_kill() but also schedules kill confirmation if - * @confirm_kill is not NULL. @confirm_kill, which may not block, will be - * called after @ref is seen as dead from all CPUs - all further - * invocations of percpu_ref_tryget_live() will fail. See - * percpu_ref_tryget_live() for more details. + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. * - * Due to the way percpu_ref is implemented, @confirm_kill will be called - * after at least one full RCU grace period has passed but this is an - * implementation detail and callers must not depend on it. + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode. In such cases, @confirm_switch + * will be invoked after the switching is complete. + * + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. 
*/ -void percpu_ref_kill_and_confirm(struct percpu_ref *ref, - percpu_ref_func_t *confirm_kill) +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) { - WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC_DEAD, - "%s called more than once on %pf!", __func__, ref->release); - - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC_DEAD; - ref->confirm_switch = confirm_kill; - - call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); + __percpu_ref_switch_to_atomic(ref, confirm_switch); } -EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_reinit - re-initialize a percpu refcount @@ -192,3 +240,34 @@ void percpu_ref_reinit(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); + +/** + * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation + * @ref: percpu_ref to kill + * @confirm_kill: optional confirmation callback + * + * Equivalent to percpu_ref_kill() but also schedules kill confirmation if + * @confirm_kill is not NULL. @confirm_kill, which may not block, will be + * called after @ref is seen as dead from all CPUs at which point all + * further invocations of percpu_ref_tryget_live() will fail. See + * percpu_ref_tryget_live() for details. + * + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode by percpu_ref_switch_atomic(). + * + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. + */ +void percpu_ref_kill_and_confirm(struct percpu_ref *ref, + percpu_ref_func_t *confirm_kill) +{ + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, + "%s called more than once on %pf!", __func__, ref->release); + + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + __percpu_ref_switch_to_atomic(ref, confirm_kill); + percpu_ref_put(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); -- cgit v0.10.2 From f47ad45784611297b699f3dffb6c7222b76afe64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:49 -0400 Subject: percpu_ref: decouple switching to percpu mode and reinit percpu_ref has treated the dropping of the base reference and switching to atomic mode as an integral operation; however, there's nothing inherent tying the two together. The use cases for percpu_ref have been expanding continuously. While the current init/kill/reinit/exit model can cover a lot, the coupling of kill/reinit with atomic/percpu mode switching is turning out to be too restrictive for use cases where many percpu_refs are created and destroyed back-to-back with only some of them reaching extended operation. The coupling also makes implementing always-atomic debug mode difficult. This patch separates out percpu mode switching into percpu_ref_switch_to_percpu() and reimplements percpu_ref_reinit() on top of it. * DEAD still requires ATOMIC. A dead ref can't be switched to percpu mode w/o going through reinit. v2: __percpu_ref_switch_to_percpu() was missing static. Fixed. Reported by Fengguang aka kbuild test robot. 
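As a rough illustration of the now-decoupled pair, a hypothetical helper (not part of the patch) that bounces a live, not-killed ref between the two modes might look like this:

#include <linux/percpu-refcount.h>

/* illustrative only; assumes @ref has not been killed */
static void my_ref_mode_cycle(struct percpu_ref *ref)
{
        /* fold the percpu counts into the atomic counter */
        percpu_ref_switch_to_atomic(ref, NULL);

        /* ... operate in atomic mode for a while ... */

        /*
         * Resume percpu operation; this may block until the preceding
         * atomic switch has been confirmed on all CPUs.
         */
        percpu_ref_switch_to_percpu(ref);
}

A ref that has been killed must instead go through percpu_ref_reinit(), which this patch reimplements on top of __percpu_ref_switch_to_percpu().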
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner Cc: kbuild test robot diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index d1252e1..cd7e20f 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -80,9 +80,10 @@ int __must_check percpu_ref_init(struct percpu_ref *ref, void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); -void percpu_ref_reinit(struct percpu_ref *ref); +void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 6e0d143..5a6d43b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -206,40 +206,54 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, __percpu_ref_switch_to_atomic(ref, confirm_switch); } -/** - * percpu_ref_reinit - re-initialize a percpu refcount - * @ref: perpcu_ref to re-initialize - * - * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully, killed - * and reached 0 but not exited. - * - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while - * this function is in progress. - */ -void percpu_ref_reinit(struct percpu_ref *ref) +static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); - atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) + return; + + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* * Restore per-cpu operation. smp_store_release() is paired with * smp_read_barrier_depends() in __ref_is_percpu() and guarantees * that the zeroing is visible to all percpu accesses which can see - * the following __PERCPU_REF_ATOMIC_DEAD clearing. + * the following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, - ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); +} + +/** + * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode + * @ref: percpu_ref to switch to percpu mode + * + * There's no reason to use this function for the usual reference counting. + * To re-use an expired ref, use percpu_ref_reinit(). + * + * Switch @ref to percpu mode. This function may be invoked concurrently + * with all the get/put operations and can safely be mixed with kill and + * reinit operations. + * + * This function normally doesn't block and can be called from any context + * but it may block if @ref is in the process of switching to atomic mode + * by percpu_ref_switch_atomic(). 
+ */ +void percpu_ref_switch_to_percpu(struct percpu_ref *ref) +{ + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ + if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_percpu(ref); } -EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation @@ -253,8 +267,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode by percpu_ref_switch_atomic(). + * but it may block if @confirm_kill is specified and @ref is in the + * process of switching to atomic mode by percpu_ref_switch_atomic(). * * Due to the way percpu_ref is implemented, @confirm_switch will be called * after at least one full sched RCU grace period has passed but this is an @@ -271,3 +285,24 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_put(ref); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); + +/** + * percpu_ref_reinit - re-initialize a percpu refcount + * @ref: perpcu_ref to re-initialize + * + * Re-initialize @ref so that it's in the same state as when it finished + * percpu_ref_init(). @ref must have been initialized successfully and + * reached 0 but not exited. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. + */ +void percpu_ref_reinit(struct percpu_ref *ref) +{ + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); + + ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; + percpu_ref_get(ref); + __percpu_ref_switch_to_percpu(ref); +} +EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 2aad2a86f6685c10360ec8a5a55eb9ab7059cb72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: percpu_ref: add PERCPU_REF_INIT_* flags With the recent addition of percpu_ref_reinit(), percpu_ref now can be used as a persistent switch which can be turned on and off repeatedly where turning off maps to killing the ref and waiting for it to drain; however, there currently isn't a way to initialize a percpu_ref in its off (killed and drained) state, which can be inconvenient for certain persistent switch use cases. Similarly, percpu_ref_switch_to_atomic/percpu() allow dynamic selection of operation mode; however, currently a newly initialized percpu_ref is always in percpu mode making it impossible to avoid the latency overhead of switching to atomic mode. This patch adds @flags to percpu_ref_init() and implements the following flags. * PERCPU_REF_INIT_ATOMIC : start ref in atomic mode * PERCPU_REF_INIT_DEAD : start ref killed and drained These flags should be able to serve the above two use cases. v2: target_core_tpg.c conversion was missing. Fixed. 
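For example, with the new @flags argument a ref can be brought up already killed and drained, acting as an off-by-default switch that is turned on later. The my_obj structure and helpers below are hypothetical, not part of the patch.

#include <linux/gfp.h>
#include <linux/percpu-refcount.h>

struct my_obj {
        struct percpu_ref       usage;
};

static void my_obj_release(struct percpu_ref *ref)
{
        /* count reached zero after a kill; may run from RCU callback context */
}

static int my_obj_setup(struct my_obj *obj)
{
        /* starts in atomic mode with ref == 0, as if already killed */
        return percpu_ref_init(&obj->usage, my_obj_release,
                               PERCPU_REF_INIT_DEAD, GFP_KERNEL);
}

static void my_obj_turn_on(struct my_obj *obj)
{
        /* restores the base reference and makes the ref usable again */
        percpu_ref_reinit(&obj->usage);
}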
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner diff --git a/block/blk-mq.c b/block/blk-mq.c index 44a78ae..d85fe01 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1796,7 +1796,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) goto err_hctxs; if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - GFP_KERNEL)) + 0, GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 4ab6da3..be783f7 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,7 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index 8d217ed..84a7510 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index cd7e20f..b0293f2 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -63,6 +63,21 @@ enum { __PERCPU_REF_FLAG_BITS = 2, }; +/* @flags for percpu_ref_init() */ +enum { + /* + * Start w/ ref == 1 in atomic mode. Can be switched to percpu + * operation using percpu_ref_switch_to_percpu(). + */ + PERCPU_REF_INIT_ATOMIC = 1 << 0, + + /* + * Start dead w/ ref == 0 in atomic mode. Must be revived with + * percpu_ref_reinit() before used. Implies INIT_ATOMIC. 
+ */ + PERCPU_REF_INIT_DEAD = 1 << 1, +}; + struct percpu_ref { atomic_long_t count; /* @@ -76,7 +91,8 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release, gfp_t gfp); + percpu_ref_func_t *release, unsigned int flags, + gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a99d504..753df01 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1634,7 +1634,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -4510,7 +4511,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4583,7 +4584,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 5a6d43b..ed280fb 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -45,27 +45,40 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * - * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analagous to atomic_long_set(ref, 1). + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a + * refcount of 1; analagous to atomic_long_set(ref, 1). See the + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, - gfp_t gfp) + unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); - - atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); + unsigned long start_count = 0; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + else + start_count += PERCPU_COUNT_BIAS; + + if (flags & PERCPU_REF_INIT_DEAD) + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + else + start_count++; + + atomic_long_set(&ref->count, start_count); + ref->release = release; return 0; } -- cgit v0.10.2 From 1cae13e75b7a7848c03138636d4eb8d8a5054dd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky Currently, a percpu_ref which is initialized with PERPCU_REF_INIT_ATOMIC or switched to atomic mode via switch_to_atomic() automatically reverts to percpu mode on the first percpu_ref_reinit(). 
This makes the atomic mode difficult to use for cases where a percpu_ref is used as a persistent on/off switch which may be cycled multiple times. This patch makes such atomic state sticky so that it survives through kill/reinit cycles. After this patch, atomic state is cleared only by an explicit percpu_ref_switch_to_percpu() call. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b0293f2..00c01fc 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -67,7 +67,9 @@ enum { enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu - * operation using percpu_ref_switch_to_percpu(). + * operation using percpu_ref_switch_to_percpu(). If initialized + * with this flag, the ref will stay in atomic mode until + * percpu_ref_switch_to_percpu() is invoked on it. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, @@ -87,6 +89,7 @@ struct percpu_ref { unsigned long percpu_count_ptr; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; + bool force_atomic:1; struct rcu_head rcu; }; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index ed280fb..6111bcb 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -67,6 +67,8 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, if (!ref->percpu_count_ptr) return -ENOMEM; + ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; else @@ -202,7 +204,8 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit - * operations. + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is already in @@ -216,6 +219,7 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { + ref->force_atomic = true; __percpu_ref_switch_to_atomic(ref, confirm_switch); } @@ -255,7 +259,10 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and - * reinit operations. + * reinit operations. This function reverses the sticky atomic state set + * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is + * dying or dead, the actual switching takes place on the following + * percpu_ref_reinit(). 
* * This function normally doesn't block and can be called from any context * but it may block if @ref is in the process of switching to atomic mode @@ -263,6 +270,8 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + ref->force_atomic = false; + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) __percpu_ref_switch_to_percpu(ref); @@ -304,8 +313,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully and - * reached 0 but not exited. + * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been + * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. @@ -316,6 +325,7 @@ void percpu_ref_reinit(struct percpu_ref *ref) ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - __percpu_ref_switch_to_percpu(ref); + if (!ref->force_atomic) + __percpu_ref_switch_to_percpu(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); -- cgit v0.10.2 From 17497acbdce9506fd6a75115dee4ab80c3cc5ee5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode blk-mq uses percpu_ref for its usage counter which tracks the number of in-flight commands and used to synchronously drain the queue on freeze. percpu_ref shutdown takes measureable wallclock time as it involves a sched RCU grace period. This means that draining a blk-mq takes measureable wallclock time. One would think that this shouldn't matter as queue shutdown should be a rare event which takes place asynchronously w.r.t. userland. Unfortunately, SCSI probing involves synchronously setting up and then tearing down a lot of request_queues back-to-back for non-existent LUNs. This means that SCSI probing may take above ten seconds when scsi-mq is used. [ 0.949892] scsi host0: Virtio SCSI HBA [ 1.007864] scsi 0:0:0:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5 [ 1.021299] scsi 0:0:1:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5 [ 1.520356] tsc: Refined TSC clocksource calibration: 2491.910 MHz [ 16.186549] sd 0:0:0:0: Attached scsi generic sg0 type 0 [ 16.190478] sd 0:0:1:0: Attached scsi generic sg1 type 0 [ 16.194099] osd: LOADED open-osd 0.2.1 [ 16.203202] sd 0:0:0:0: [sda] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB) [ 16.208478] sd 0:0:0:0: [sda] Write Protect is off [ 16.211439] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA [ 16.218771] sd 0:0:1:0: [sdb] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB) [ 16.223264] sd 0:0:1:0: [sdb] Write Protect is off [ 16.225682] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA This is also the reason why request_queues start in bypass mode which is ended on blk_register_queue() as shutting down a fully functional queue also involves a RCU grace period and the queues for non-existent SCSI devices never reach registration. blk-mq basically needs to do the same thing - start the mq in a degraded mode which is faster to shut down and then make it fully functional only after the queue reaches registration. 
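In percpu_ref terms, that degraded-then-promoted life cycle looks roughly like the following condensed, hypothetical sketch (the actual conversion of q->mq_usage_counter is in the diff below; my_counter_init() and my_counter_promote() are illustrative names only):

#include <linux/gfp.h>
#include <linux/percpu-refcount.h>

static int my_counter_init(struct percpu_ref *ref, percpu_ref_func_t *release)
{
        /*
         * Degraded (atomic) mode: if the object is torn down during
         * probing, the sched RCU grace period on shutdown is avoided,
         * so teardown stays cheap.
         */
        return percpu_ref_init(ref, release, PERCPU_REF_INIT_ATOMIC,
                               GFP_KERNEL);
}

static void my_counter_promote(struct percpu_ref *ref)
{
        /* the object reached registration; pay for percpu operation now */
        percpu_ref_switch_to_percpu(ref);
}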
percpu_ref recently grew facilities to force atomic operation until explicitly switched to percpu mode, which can be used for this purpose. This patch makes blk-mq initialize q->mq_usage_counter in atomic mode and switch it to percpu mode only once blk_register_queue() is reached. Note that this issue was previously worked around by 0a30288da1ae ("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe") for v3.17. The temp fix was reverted in preparation of adding persistent atomic mode to percpu_ref by 9eca80461a45 ("Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe""). This patch and the prerequisite percpu_ref changes will be merged during v3.18 devel cycle. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count") Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Johannes Weiner diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ed52178..371d880 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } +/* see blk_register_queue() */ +void blk_mq_finish_init(struct request_queue *q) +{ + percpu_ref_switch_to_percpu(&q->mq_usage_counter); +} + int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); diff --git a/block/blk-mq.c b/block/blk-mq.c index d85fe01..38f4a165 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1795,8 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - 0, GFP_KERNEL)) + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 17f5c84..521ae90 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk) return -ENXIO; /* - * Initialization must be complete by now. Finish the initial - * bypass from queue allocation. + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. 
*/ if (!blk_queue_init_done(q)) { queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_finish_init(q); } ret = blk_trace_init_sysfs(dev); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a1e31f2..c13a0c0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,7 @@ enum { }; struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +void blk_mq_finish_init(struct request_queue *q); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); -- cgit v0.10.2 From 6ae833c7fe0c6ef1f0ab13cc775da230d6f4c256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Oct 2014 12:01:52 -0400 Subject: percpu: fix how @gfp is interpreted by the percpu allocator When @gfp is specified, the percpu allocator is interested in whether it contains all of GFP_KERNEL or not. If it does, the normal allocation path is taken; otherwise, the atomic allocation path. Unfortunately, pcpu_alloc() was incorrectly testing for whether @gfp contains any part of GFP_KERNEL. Fix it by testing "(gfp & GFP_KERNEL) != GFP_KERNEL" instead of "!(gfp & GFP_KERNEL)" to decide whether the allocation should be atomic or not. Signed-off-by: Tejun Heo diff --git a/mm/percpu.c b/mm/percpu.c index e10f9f7..014bab6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -876,7 +876,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - bool is_atomic = !(gfp & GFP_KERNEL); + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; -- cgit v0.10.2
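To make the fixed test concrete, here is a hypothetical classification helper (pcpu_gfp_is_atomic() is not a real kernel function) mirroring the corrected check in pcpu_alloc(), together with how it treats a few common masks. The reasoning relies only on the masks' documented meanings, e.g. that GFP_NOFS allows reclaim but lacks __GFP_FS and is therefore only a subset of GFP_KERNEL.

#include <linux/gfp.h>
#include <linux/types.h>

/* hypothetical helper; same expression as the corrected pcpu_alloc() check */
static bool pcpu_gfp_is_atomic(gfp_t gfp)
{
        return (gfp & GFP_KERNEL) != GFP_KERNEL;
}

/*
 * pcpu_gfp_is_atomic(GFP_KERNEL) -> false: all of GFP_KERNEL present,
 *                                   normal allocation path
 * pcpu_gfp_is_atomic(GFP_NOFS)   -> true:  shares bits with GFP_KERNEL
 *                                   but lacks __GFP_FS, atomic path
 * pcpu_gfp_is_atomic(GFP_NOWAIT) -> true:  atomic path
 *
 * The old "!(gfp & GFP_KERNEL)" test returned false for masks such as
 * GFP_NOFS because they overlap with GFP_KERNEL, wrongly sending those
 * callers down the normal, GFP_KERNEL-capable path.
 */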