From db2a0dd7a43de595d3f0542986bb17ccb6cc364c Mon Sep 17 00:00:00 2001
From: Yaowei Bai
Date: Fri, 6 Nov 2015 16:28:06 -0800
Subject: mm/oom_kill.c: introduce is_sysrq_oom helper

Introduce the is_sysrq_oom() helper to indicate an oom kill triggered by
sysrq, improving readability.  No functional changes.

Signed-off-by: Yaowei Bai
Acked-by: David Rientjes
Acked-by: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e477828..d13a339 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -118,6 +118,15 @@ found:
 	return t;
 }
 
+/*
+ * order == -1 means the oom kill is required by sysrq, otherwise only
+ * for display purposes.
+ */
+static inline bool is_sysrq_oom(struct oom_control *oc)
+{
+	return oc->order == -1;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * Don't allow any other task to have access to the reserves.
 	 */
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (oc->order != -1)
+		if (!is_sysrq_oom(oc))
 			return OOM_SCAN_ABORT;
 	}
 	if (!task->mm)
@@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task_will_free_mem(task) && oc->order != -1)
+	if (task_will_free_mem(task) && !is_sysrq_oom(oc))
 		return OOM_SCAN_ABORT;
 
 	return OOM_SCAN_OK;
@@ -629,7 +638,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
 		return;
 	}
 	/* Do not panic for oom kills triggered by sysrq */
-	if (oc->order == -1)
+	if (is_sysrq_oom(oc))
 		return;
 	dump_header(oc, NULL, memcg);
 	panic("Out of memory: %s panic_on_oom is enabled\n",
@@ -709,7 +718,7 @@ bool out_of_memory(struct oom_control *oc)
 
 	p = select_bad_process(oc, &points, totalpages);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!p && oc->order != -1) {
+	if (!p && !is_sysrq_oom(oc)) {
 		dump_header(oc, NULL, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-- 
cgit v0.10.2


From e2b19197ff9dc46f3e3888f273c4395f9e5a9856 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Fri, 6 Nov 2015 16:28:09 -0800
Subject: mm, page_alloc: remove unnecessary parameter from zone_watermark_ok_safe

Overall, the intent of this series is to remove the zonelist cache, which
was introduced to avoid high overhead in the page allocator.  Once this is
done, it is necessary to reduce the cost of watermark checks.

The series starts with minor micro-optimisations.

Next it notes that GFP flags that affect watermark checks are abused.
Historically, callers that could not sleep cleared __GFP_WAIT and could
access reserves.  Clearing the flag was later abused by callers that
simply prefer to avoid sleeping and have other options.  A patch
distinguishes between atomic callers, high-priority callers and those
that simply wish to avoid sleep.

The zonelist cache has been around for a long time but it is of dubious
merit with a lot of complexity and some issues that are explained.  The
most important issue is that a failed THP allocation can cause a zone to
be treated as "full".  This potentially causes unnecessary stalls, reclaim
activity or remote fallbacks.  The issues could be fixed but it's not
worth it.  The series places a small number of other micro-optimisations
on top before examining GFP flags and watermarks.
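As a reader aid for the distinction drawn above between atomic callers,
high-priority callers and callers that merely wish to avoid sleep, the
sketch below shows roughly how those classes map onto the flags defined
later in this series.  It is illustrative only and not part of any patch;
the enum and the helper classify_gfp_caller() are invented for the example.

  #include <linux/gfp.h>

  /* Illustrative sketch only -- not from any patch in this series. */
  enum gfp_caller_class {
          GFP_CALLER_ATOMIC,      /* cannot sleep, may dip into atomic reserves */
          GFP_CALLER_HIGHPRI,     /* may use the smaller high-priority reserve */
          GFP_CALLER_NORMAL,      /* can block, or simply prefers not to */
  };

  static enum gfp_caller_class classify_gfp_caller(gfp_t gfp_mask)
  {
          if (gfp_mask & __GFP_ATOMIC)    /* e.g. GFP_ATOMIC */
                  return GFP_CALLER_ATOMIC;
          if (gfp_mask & __GFP_HIGH)      /* high-priority reserve only */
                  return GFP_CALLER_HIGHPRI;
          /*
           * Everything else either enters direct reclaim
           * (__GFP_DIRECT_RECLAIM set) or avoids sleeping because a
           * fallback exists (e.g. GFP_NOWAIT); no reserve access.
           */
          return GFP_CALLER_NORMAL;
  }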
High-order watermarks enforcement can cause high-order allocations to fail even though pages are free. The watermark checks both protect high-order atomic allocations and make kswapd aware of high-order pages but there is a much better way that can be handled using migrate types. This series uses page grouping by mobility to reserve pageblocks for high-order allocations with the size of the reservation depending on demand. kswapd awareness is maintained by examining the free lists. By patch 12 in this series, there are no high-order watermark checks while preserving the properties that motivated the introduction of the watermark checks. This patch (of 10): No user of zone_watermark_ok_safe() specifies alloc_flags. This patch removes the unnecessary parameter. Signed-off-by: Mel Gorman Acked-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2d7e660..e326843 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -817,7 +817,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 446bb36..d73c346 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2249,6 +2249,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) @@ -2278,14 +2279,14 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags) + unsigned long mark, int classzone_idx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, classzone_idx, 0, free_pages); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 55721b6..e0cd7ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2477,7 +2477,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2960,7 +2960,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, -- cgit v0.10.2 From c9ab0c4fbeb0202bac3548378a977e1536ebe3ca Mon Sep 17 00:00:00 2001 From: Mel 
Gorman Date: Fri, 6 Nov 2015 16:28:12 -0800 Subject: mm, page_alloc: remove unnecessary recalculations for dirty zone balancing File-backed pages that will be immediately written are balanced between zones. This heuristic tries to avoid having a single zone filled with recently dirtied pages but the checks are unnecessarily expensive. Move consider_zone_balanced into the alloc_context instead of checking bitmaps multiple times. The patch also gives the parameter a more meaningful name. Signed-off-by: Mel Gorman Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index d4b807d..ff0f1ad 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -129,6 +129,7 @@ struct alloc_context { int classzone_idx; int migratetype; enum zone_type high_zoneidx; + bool spread_dirty_pages; }; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d73c346..6739098 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2478,8 +2478,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2534,14 +2532,14 @@ zonelist_scan: * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath - * (ALLOC_WMARK_LOW unset) before going into reclaim, + * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if (consider_zone_dirty && !zone_dirty_ok(zone)) + if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; @@ -3232,6 +3230,10 @@ retry_cpuset: /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; + + /* Dirty zone balancing only done in the fast path */ + ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? : &cpuset_current_mems_allowed, @@ -3250,6 +3252,7 @@ retry_cpuset: * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; page = __alloc_pages_slowpath(alloc_mask, order, &ac); } -- cgit v0.10.2 From 46e700abc44ce215acb4341d9702ce3972eda571 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:15 -0800 Subject: mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled There is a seqcounter that protects against spurious allocation failures when a task is changing the allowed nodes in a cpuset. There is no need to check the seqcounter until a cpuset exists. 
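To make the fast path concrete, the sketch below mirrors the allocator's
retry_cpuset pattern that these helpers protect.  It is a simplified
illustration, not verbatim kernel code; try_allocation() is a made-up
stand-in for the real zonelist walk.  With this patch,
read_mems_allowed_begin() and read_mems_allowed_retry() return immediately
while no cpuset has ever been created, so the common case no longer
touches the seqcount at all.

  #include <linux/cpuset.h>
  #include <linux/gfp.h>
  #include <linux/mm_types.h>

  /* Stand-in for the real allocation attempt against current->mems_allowed. */
  static struct page *try_allocation(gfp_t gfp_mask, unsigned int order);

  static struct page *alloc_respecting_mems_allowed(gfp_t gfp_mask,
                                                    unsigned int order)
  {
          struct page *page;
          unsigned int cpuset_mems_cookie;

  retry_cpuset:
          cpuset_mems_cookie = read_mems_allowed_begin();

          /* Pick zonelist/nodemask from current->mems_allowed and allocate. */
          page = try_allocation(gfp_mask, order);

          /*
           * A failure may be spurious if mems_allowed changed concurrently
           * under a cpuset update; in that case, retry the allocation.
           */
          if (!page && read_mems_allowed_retry(cpuset_mems_cookie))
                  goto retry_cpuset;

          return page;
  }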
Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 5a13119..85a868c 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -104,6 +104,9 @@ extern void cpuset_print_current_mems_allowed(void); */ static inline unsigned int read_mems_allowed_begin(void) { + if (!cpusets_enabled()) + return 0; + return read_seqcount_begin(¤t->mems_allowed_seq); } @@ -115,6 +118,9 @@ static inline unsigned int read_mems_allowed_begin(void) */ static inline bool read_mems_allowed_retry(unsigned int seq) { + if (!cpusets_enabled()) + return false; + return read_seqcount_retry(¤t->mems_allowed_seq, seq); } -- cgit v0.10.2 From 016c13daa5c9e4827eca703e2f0621c131f2cca3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:18 -0800 Subject: mm, page_alloc: use masks and shifts when converting GFP flags to migrate types This patch redefines which GFP bits are used for specifying mobility and the order of the migrate types. Once redefined it's possible to convert GFP flags to a migrate type with a simple mask and shift. The only downside is that readers of OOM kill messages and allocation failures may have been used to the existing values but scripts/gfp-translate will help. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f92cbd2..440fca3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -14,7 +14,7 @@ struct vm_area_struct; #define ___GFP_HIGHMEM 0x02u #define ___GFP_DMA32 0x04u #define ___GFP_MOVABLE 0x08u -#define ___GFP_WAIT 0x10u +#define ___GFP_RECLAIMABLE 0x10u #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u @@ -29,7 +29,7 @@ struct vm_area_struct; #define ___GFP_NOMEMALLOC 0x10000u #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u -#define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_WAIT 0x80000u #define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u @@ -126,6 +126,7 @@ struct vm_area_struct; /* This mask makes up all the page movable related flags */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) +#define GFP_MOVABLE_SHIFT 3 /* Control page allocator reclaim behavior */ #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ @@ -152,14 +153,15 @@ struct vm_area_struct; /* Convert GFP flags to their corresponding migrate type */ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { - WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); + VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); + BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); + BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ - return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) | - ((gfp_flags & __GFP_RECLAIMABLE) != 0); + return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #ifdef CONFIG_HIGHMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e326843..38bed71 100644 --- a/include/linux/mmzone.h +++ 
b/include/linux/mmzone.h
@@ -37,8 +37,8 @@
 
 enum {
 	MIGRATE_UNMOVABLE,
-	MIGRATE_RECLAIMABLE,
 	MIGRATE_MOVABLE,
+	MIGRATE_RECLAIMABLE,
 	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
 	MIGRATE_RESERVE = MIGRATE_PCPTYPES,
 #ifdef CONFIG_CMA
-- 
cgit v0.10.2


From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Fri, 6 Nov 2015 16:28:21 -0800
Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling
 to sleep and avoiding waking kswapd

__GFP_WAIT has been used to identify atomic context in callers that hold
spinlocks or are in interrupts.  They are expected to be high priority and
have access to one of two watermarks lower than "min", which can be
referred to as the "atomic reserve".  __GFP_HIGH users get access to the
first lower watermark and can be called the "high priority reserve".

Over time, callers had a requirement to not block when fallback options
were available.  Some have abused __GFP_WAIT, leading to a situation where
an optimistic allocation with a fallback option can access atomic
reserves.

This patch uses __GFP_ATOMIC to identify callers that are truly atomic,
cannot sleep and have no alternative.  High priority users continue to use
__GFP_HIGH.  __GFP_DIRECT_RECLAIM identifies callers that can sleep and
are willing to enter direct reclaim.  __GFP_KSWAPD_RECLAIM identifies
callers that want to wake kswapd for background reclaim.  __GFP_WAIT is
redefined as a caller that is willing to enter direct reclaim and wake
kswapd for background reclaim.

This patch then converts a number of sites:

o __GFP_ATOMIC is used by callers that are high priority and have memory
  pools for those requests.  GFP_ATOMIC uses this flag.

o Callers that have a limited mempool to guarantee forward progress clear
  __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM.  bio allocations
  fall into this category where kswapd will still be woken, but atomic
  reserves are not used as there is a one-entry mempool to guarantee
  progress.

o Callers that are checking if they are non-blocking should use the
  helper gfpflags_allow_blocking() where possible.  This is because
  checking for __GFP_WAIT as was done historically now can trigger false
  positives.  Some exceptions like dm-crypt.c exist where the code intent
  is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to
  flag manipulations.

o Callers that built their own GFP flags instead of starting with
  GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM.

The first key hazard to watch out for is callers that removed __GFP_WAIT
and were depending on access to atomic reserves for inconspicuous reasons.
In some cases it may be appropriate for them to use __GFP_HIGH.

The second key hazard is callers that assembled their own combination of
GFP flags instead of starting with something like GFP_KERNEL.  They may
now wish to specify __GFP_KSWAPD_RECLAIM.  It's almost certainly harmless
if it's missed in most cases as other activity will wake kswapd.

Signed-off-by: Mel Gorman
Acked-by: Vlastimil Babka
Acked-by: Michal Hocko
Acked-by: Johannes Weiner
Cc: Christoph Lameter
Cc: David Rientjes
Cc: Vitaly Wool
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/Documentation/vm/balance b/Documentation/vm/balance
index c46e68c..9645954 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance
@@ -1,12 +1,14 @@
 Started Jan 2000 by Kanoj Sarcar
 
-Memory balancing is needed for non __GFP_WAIT as well as for non
-__GFP_IO allocations.
+Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. -There are two reasons to be requesting non __GFP_WAIT allocations: -the caller can not sleep (typically intr context), or does not want -to incur cost overheads of page stealing and possible swap io for -whatever reasons. +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. In such cases, +the caller may also wish to avoid waking kswapd. __GFP_IO allocation requests are made to prevent file system deadlocks. diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ad4eb2d..e62400e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (nommu()) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) + else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM)) addr = __alloc_from_contiguous(dev, size, prot, &page, caller, want_vaddr); else if (is_coherent) addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) + else if (!gfpflags_allow_blocking(gfp)) addr = __alloc_from_pool(size, &page); else addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, @@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 7c34f71..c5f9a9e 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -25,7 +25,7 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) { struct memblock_region *reg; - gfp_t flags = __GFP_NOWARN; + gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6320361..bb4bf6a 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) { struct page *page; void *addr; @@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size, size = PAGE_ALIGN(size); - if (!coherent && !(flags & __GFP_WAIT)) { + if (!coherent && !gfpflags_allow_blocking(flags)) { struct page *page = NULL; void *addr = __alloc_from_pool(size, &page, flags); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cd99433..6ba014c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { 
dma_release_from_contiguous(dev, page, count); diff --git a/block/bio.c b/block/bio.c index ad3f276..4f184d9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { diff --git a/block/blk-core.c b/block/blk-core.c index 89eec79..9e32f08 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1206,8 +1206,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. 
* * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1227,7 +1227,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1305,11 +1305,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45..381cb50 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60ac684..a07ca34 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3e..68c0a34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -244,11 +244,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_WAIT|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c097909..b4b5680 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto } if (has_payload && data_size) { - page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + page = drbd_alloc_pages(peer_device, nr_pages, + 
gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; } diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index e229425..1b709a4 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) goto err_out; tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; + gfpmask &= ~__GFP_DIRECT_RECLAIM; tmp->bi_next = NULL; if (!new_chain) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 30f5228..d7373ca 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -124,7 +124,8 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, if (group) return netlink_broadcast(dev->nls, skb, portid, group, gfp_mask); - return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT)); + return netlink_unicast(dev->nls, skb, portid, + !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 2a3973a..36a7c2d 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -486,7 +486,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) static int add_client_resource(struct client *client, struct client_resource *resource, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4d631a9..d58cb9e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2215,7 +2215,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) */ mapping = file_inode(obj->base.filp)->i_mapping; gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; + gfp |= __GFP_NORETRY | __GFP_NOWARN; gfp &= ~(__GFP_IO | __GFP_WAIT); sg = st->sgl; st->nents = 0; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8c014b3..59ab264 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1083,7 +1083,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { - bool preload = !!(gfp_mask & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0d533bb..8b2be1e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size, page = alloc_pages(flag | __GFP_NOWARN, get_order(size)); if (!page) { - if (!(flag & __GFP_WAIT)) + if (!gfpflags_allow_blocking(flag)) return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7cf80c1..f1042da 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3647,7 +3647,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; } - if (flags & __GFP_WAIT) { + if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; page = dma_alloc_from_contiguous(dev, count, order); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3729b39..917d47e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -994,7 
+994,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) struct bio_vec *bvec; retry: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); @@ -1010,7 +1010,7 @@ retry: if (!page) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - gfp_mask |= __GFP_WAIT; + gfp_mask |= __GFP_DIRECT_RECLAIM; goto retry; } @@ -1027,7 +1027,7 @@ retry: } return_clone: - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); return clone; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3a7cade..1452ed9 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc, *pages = NULL; do { - pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM); if (unlikely(!pl)) { /* Use reserved pages */ pl = kc->pages; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c index 1bd2fd4..4432fd6 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c @@ -1297,7 +1297,7 @@ static struct solo_enc_dev *solo_enc_alloc(struct solo_dev *solo_dev, solo_enc->vidq.ops = &solo_enc_video_qops; solo_enc->vidq.mem_ops = &vb2_dma_sg_memops; solo_enc->vidq.drv_priv = solo_enc; - solo_enc->vidq.gfp_flags = __GFP_DMA32; + solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); solo_enc->vidq.lock = &solo_enc->lock; diff --git a/drivers/media/pci/solo6x10/solo6x10-v4l2.c b/drivers/media/pci/solo6x10/solo6x10-v4l2.c index 26df903..f7ce493 100644 --- a/drivers/media/pci/solo6x10/solo6x10-v4l2.c +++ b/drivers/media/pci/solo6x10/solo6x10-v4l2.c @@ -678,7 +678,7 @@ int solo_v4l2_init(struct solo_dev *solo_dev, unsigned nr) solo_dev->vidq.mem_ops = &vb2_dma_contig_memops; solo_dev->vidq.drv_priv = solo_dev; solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; - solo_dev->vidq.gfp_flags = __GFP_DMA32; + solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf); solo_dev->vidq.lock = &solo_dev->lock; ret = vb2_queue_init(&solo_dev->vidq); diff --git a/drivers/media/pci/tw68/tw68-video.c b/drivers/media/pci/tw68/tw68-video.c index 4c3293d..46642ef 100644 --- a/drivers/media/pci/tw68/tw68-video.c +++ b/drivers/media/pci/tw68/tw68-video.c @@ -979,7 +979,7 @@ int tw68_video_init2(struct tw68_dev *dev, int video_nr) dev->vidq.ops = &tw68_video_qops; dev->vidq.mem_ops = &vb2_dma_sg_memops; dev->vidq.drv_priv = dev; - dev->vidq.gfp_flags = __GFP_DMA32; + dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM; dev->vidq.buf_struct_size = sizeof(struct tw68_buf); dev->vidq.lock = &dev->lock; dev->vidq.min_buffers_needed = 2; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 8bbbb75..2dfb291 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1188,8 +1188,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY; size_t min_alloc = max_t(size_t, 
mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 44173be..f8d7a2f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -691,7 +691,7 @@ static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp, gfp_t gfp_mask) { if (fp->rx_frag_size) { /* GFP_KERNEL allocations are used only during initialization */ - if (unlikely(gfp_mask & __GFP_WAIT)) + if (unlikely(gfpflags_allow_blocking(gfp_mask))) return (void *)__get_free_page(gfp_mask); return netdev_alloc_frag(fp->rx_frag_size); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index ada724a..d4c3e55 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -27,7 +27,7 @@ #include "ion_priv.h" static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN | - __GFP_NORETRY) & ~__GFP_WAIT; + __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM; static gfp_t low_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN); static const unsigned int orders[] = {8, 4, 0}; static const int num_orders = ARRAY_SIZE(orders); diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index 6af733d..f0b0423 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -95,7 +95,7 @@ do { \ do { \ LASSERT(!in_interrupt() || \ ((size) <= LIBCFS_VMALLOC_SIZE && \ - ((mask) & __GFP_WAIT) == 0)); \ + !gfpflags_allow_blocking(mask))); \ } while (0) #define LIBCFS_ALLOC_POST(ptr, size) \ diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 0a94895..692ccc6 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -2244,7 +2244,7 @@ static int u132_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, { struct u132 *u132 = hcd_to_u132(hcd); if (irqs_disabled()) { - if (__GFP_WAIT & mem_flags) { + if (gfpflags_allow_blocking(mem_flags)) { printk(KERN_ERR "invalid context for function that might sleep\n"); return -EINVAL; } diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 6b70d7f..1c1e95a 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -99,7 +99,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order, * below the first 16MB. 
*/ - flags = __GFP_DMA | __GFP_HIGH; + flags = __GFP_DMA | __GFP_HIGH | __GFP_KSWAPD_RECLAIM; va->logical = __get_free_pages(flags, --max_order); } while (va->logical == 0 && max_order > min_order); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e60d00..c339d56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2572,7 +2572,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3915c94..032abfb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -594,7 +594,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -718,7 +718,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -850,7 +850,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { prealloc = alloc_extent_state(mask); BUG_ON(!prealloc); } @@ -1028,7 +1028,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -1076,7 +1076,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. 
We might have a cached state @@ -1253,7 +1253,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); first_iteration = false; goto again; @@ -4319,7 +4319,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; - if ((mask & __GFP_WAIT) && + if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > 16 * 1024 * 1024) { u64 len; while (start <= end) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..e023919 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -156,8 +156,8 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a63c7b0..49f6c78 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1058,7 +1058,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); + wait & ~__GFP_DIRECT_RECLAIM); return try_to_free_buffers(page); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d403c69..43040721 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 483bbc6..79483b3 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) /* * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_WAIT is flagged + * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged */ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, @@ -122,7 +122,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -132,7 +132,7 @@ page_busy: _debug("fscache writeout timeout page: %p{%lx}", page, page->index); - gfp &= ~__GFP_WAIT; + gfp &= ~__GFP_DIRECT_RECLAIM; goto try_again; } EXPORT_SYMBOL(__fscache_maybe_release_page); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6b8338e..89463ee 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1937,8 +1937,8 @@ out: * @journal: journal for operation * @page: to try and free * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. + * buffers. 
If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit + * code to release the buffers. * * * For all the buffers on this page, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 37f639d..93e2364 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Always try to initiate a 'commit' if relevant, but only - * wait for it if __GFP_WAIT is set. Even then, only wait 1 - * second and only if the 'bdi' is not congested. + * wait for it if the caller allows blocking. Even then, + * only wait 1 second and only if the 'bdi' is not congested. * Waiting indefinitely can cause deadlocks when the NFS * server is on this machine, when a new TCP connection is * needed and in other rare cases. There is no particular @@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) if (mapping) { struct nfs_server *nfss = NFS_SERVER(mapping->host); nfs_commit_inode(mapping->host, 0); - if ((gfp & __GFP_WAIT) && + if (gfpflags_allow_blocking(gfp) && !bdi_write_congested(&nfss->backing_dev_info)) { wait_on_page_bit_killable_timeout(page, PG_private, HZ); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index eac9549..587174f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -525,7 +525,7 @@ xfs_qm_shrink_scan( unsigned long freed; int error; - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 440fca3..b56e811 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -29,12 +29,13 @@ struct vm_area_struct; #define ___GFP_NOMEMALLOC 0x10000u #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u -#define ___GFP_WAIT 0x80000u +#define ___GFP_ATOMIC 0x80000u #define ___GFP_NOACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u -#define ___GFP_NO_KSWAPD 0x400000u +#define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +#define ___GFP_KSWAPD_RECLAIM 0x2000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -71,7 +72,7 @@ struct vm_area_struct; * __GFP_MOVABLE: Flag that this page will be movable by the page migration * mechanism or reclaimed */ -#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */ +#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) /* Caller cannot wait or reschedule */ #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ #define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ #define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? */ @@ -94,23 +95,37 @@ struct vm_area_struct; #define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* + * A caller that is willing to wait may enter direct reclaim and will + * wake kswapd to reclaim pages in the background until the high + * watermark is met. 
A caller may wish to clear __GFP_DIRECT_RECLAIM to + * avoid unnecessary delays when a fallback option is available but + * still allow kswapd to reclaim in the background. The kswapd flag + * can be cleared when the reclaiming of pages would cause unnecessary + * disruption. + */ +#define __GFP_WAIT ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ +#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ + +/* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -/* This equals 0, but use constants in case they ever change */ -#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) -/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ -#define GFP_ATOMIC (__GFP_HIGH) +/* + * GFP_ATOMIC callers can not sleep, need the allocation to succeed. + * A lower watermark is applied to allow access to "atomic reserves" + */ +#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_WAIT) #define GFP_NOFS (__GFP_WAIT | __GFP_IO) #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) @@ -119,10 +134,10 @@ struct vm_area_struct; #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) -#define GFP_IOFS (__GFP_IO | __GFP_FS) -#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) +#define GFP_IOFS (__GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM) +#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ + ~__GFP_KSWAPD_RECLAIM) /* This mask makes up all the page movable related flags */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -164,6 +179,11 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } +static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) +{ + return gfp_flags & __GFP_DIRECT_RECLAIM; +} + #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 24f4dfd..4355129 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1224,7 +1224,7 @@ static inline int skb_cloned(const struct sk_buff *skb) static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); @@ -1308,7 +1308,7 @@ static inline int skb_shared(const struct sk_buff *skb) */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); @@ -1344,7 +1344,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { - might_sleep_if(pri 
& __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); diff --git a/include/net/sock.h b/include/net/sock.h index f570e75e..bbf7c2c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2041,7 +2041,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (sk->sk_allocation & __GFP_WAIT) + if (gfpflags_allow_blocking(sk->sk_allocation)) return ¤t->task_frag; return &sk->sk_frag; diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5..dde6bf0 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -20,7 +20,7 @@ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ - {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \ + {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "GFP_IO"}, \ {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ @@ -36,7 +36,8 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ + {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a3..5ffcbd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce..f1603c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4..deae390 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4..3a97060 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = 
alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/smp.c b/kernel/smp.c index 0785447..d903c02 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); diff --git a/lib/idr.c b/lib/idr.c index 5335c43..6098336 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -399,7 +399,7 @@ void idr_preload(gfp_t gfp_mask) * allocation guarantee. Disallow usage from those contexts. */ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); preempt_disable(); @@ -453,7 +453,7 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) struct idr_layer *pa[MAX_IDR_LEVEL + 1]; int id; - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); /* sanity checks */ if (WARN_ON_ONCE(start < 0)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f9ebe1c..fcf5d98 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -188,7 +188,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * preloading in the interrupt anyway as all the allocations have to * be atomic. So just do normal allocation when in interrupt. */ - if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { + if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -249,7 +249,7 @@ radix_tree_node_free(struct radix_tree_node *node) * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static int __radix_tree_preload(gfp_t gfp_mask) { @@ -286,12 +286,12 @@ out: * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without - * __GFP_WAIT being passed to INIT_RADIX_TREE(). + * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... 
*/ - WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask); } EXPORT_SYMBOL(radix_tree_preload); @@ -303,7 +303,7 @@ EXPORT_SYMBOL(radix_tree_preload); */ int radix_tree_maybe_preload(gfp_t gfp_mask) { - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 619984f..8ed2ffd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfp & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; diff --git a/mm/dmapool.c b/mm/dmapool.c index 312a716..57312b5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc502e5..05374f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2046,7 +2046,7 @@ retry: if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -4364,8 +4364,8 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; diff --git a/mm/mempool.c b/mm/mempool.c index 4c533bc..004d42b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -349,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. 
*/ if (gfp_temp != gfp_mask) { @@ -358,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 2834fab..e60379e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1578,7 +1578,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~(__GFP_IO | __GFP_FS), 0); return newpage; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6739098..70461f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -169,12 +169,12 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } @@ -2183,7 +2183,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2685,7 +2685,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2945,7 +2945,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2954,11 +2953,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. 
@@ -2995,11 +2994,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -3020,15 +3024,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -3071,8 +3083,8 @@ retry: } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this @@ -3102,7 +3114,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -3137,8 +3149,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -3209,7 +3220,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; diff --git a/mm/slab.c b/mm/slab.c index 272e809..a9ef77d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1031,12 +1031,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -2633,7 +2633,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2663,7 +2663,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2677,7 +2677,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2869,7 +2869,7 @@ force_grow: static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -3057,11 +3057,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* diff --git a/mm/slub.c b/mm/slub.c index 75a5fa9..9769562 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1265,7 +1265,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s->object_size, flags, s->flags)) return NULL; @@ -1353,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1363,8 +1363,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1424,7 +1424,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9db9ef5..7ee94dc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1617,7 +1617,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e0cd7ee..2aec424 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1476,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. 
*/ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -3791,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/mm/zswap.c b/mm/zswap.c index 4043df7..e54166d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -571,7 +571,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); ret = zpool_malloc(entry->pool->zpool, len, - __GFP_NORETRY | __GFP_NOWARN, &handle); + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fab4599..aa41e6d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -414,7 +414,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -481,7 +481,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, len += NET_SKB_PAD + NET_IP_ALIGN; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; @@ -4452,7 +4452,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; gfp_head = gfp_mask; - if (gfp_head & __GFP_WAIT) + if (gfp_head & __GFP_DIRECT_RECLAIM) gfp_head |= __GFP_REPEAT; *errcode = -ENOBUFS; @@ -4467,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, while (order) { if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_WAIT) | + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, diff --git a/net/core/sock.c b/net/core/sock.c index 7529eb9..1e4dd54 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1944,8 +1944,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY, + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fafe33b..59651af 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2116,7 +2116,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid 
consume_skb(info.skb2); if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) + if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 96744b7..977fb86 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (gfp & __GFP_WAIT) { + if (gfp & __GFP_DIRECT_RECLAIM) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; - bool can_wait = !!(gfp & __GFP_WAIT); + bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); u32 pos; /* the goal here is to just make sure that someone, somewhere diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 692b3e6..6c71ed1 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, if (bundle->num_conns >= 20) { _debug("too many conns"); - if (!(gfp & __GFP_WAIT)) { + if (!gfpflags_allow_blocking(gfp)) { _leave(" = -EAGAIN"); return -EAGAIN; } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b00f1f9..559afd0 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1590,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { - bool preload = !!(gfp & __GFP_WAIT); + bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. */ -- cgit v0.10.2 From 40113370836e8e79befa585277296ed42781ef31 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:25 -0800 Subject: mm: page_alloc: remove GFP_IOFS GFP_IOFS was intended to be shorthand for clearing two flags, not a set of allocation flags. There is only one user of this flag combination now and there appears to be no reason why Lustre had to be protected from reclaim stalls. As none of the sites appear to be atomic, this patch simply deletes GFP_IOFS and converts Lustre to using GFP_KERNEL, GFP_NOFS or GFP_NOIO as appropriate. 
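As a minimal illustration of the conversion pattern (the functions below are hypothetical and not taken from the Lustre tree), a call site that previously spelled GFP_IOFS now picks the standard mask that matches its calling context:

  #include <linux/gfp.h>
  #include <linux/slab.h>

  /* Illustrative sketch only; not actual converted code. */

  /* Before: GFP_IOFS allowed IO and FS reclaim and woke kswapd, but the
   * name read like a flag-clearing shorthand rather than an allocation mask. */
  static struct page *example_alloc_old(void)
  {
          return alloc_page(GFP_IOFS);
  }

  /* After: ordinary process context with no locking constraints. */
  static struct page *example_alloc_new(void)
  {
          return alloc_page(GFP_KERNEL);
  }

  /* After: a path that may be re-entered from the filesystem. */
  static void *example_alloc_fs_path(size_t len)
  {
          return kmalloc(len, GFP_NOFS);
  }
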
Signed-off-by: Mel Gorman Cc: Oleg Drokin Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c index fe49f1b..4ea651c 100644 --- a/drivers/staging/lustre/lnet/lnet/router.c +++ b/drivers/staging/lustre/lnet/lnet/router.c @@ -1245,7 +1245,7 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) for (i = 0; i < npages; i++) { page = alloc_pages_node( cfs_cpt_spread_node(lnet_cpt_table(), cpt), - __GFP_ZERO | GFP_IOFS, 0); + GFP_KERNEL | __GFP_ZERO, 0); if (page == NULL) { while (--i >= 0) __free_page(rb->rb_kiov[i].kiov_page); diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c index 0060ff6..64a0335 100644 --- a/drivers/staging/lustre/lnet/selftest/conrpc.c +++ b/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -860,7 +860,7 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, bulk->bk_iovs[i].kiov_offset = 0; bulk->bk_iovs[i].kiov_len = len; bulk->bk_iovs[i].kiov_page = - alloc_page(GFP_IOFS); + alloc_page(GFP_KERNEL); if (bulk->bk_iovs[i].kiov_page == NULL) { lstcon_rpc_put(*crpc); diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c index 162f9d3..7005002 100644 --- a/drivers/staging/lustre/lnet/selftest/rpc.c +++ b/drivers/staging/lustre/lnet/selftest/rpc.c @@ -146,7 +146,7 @@ srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink) int nob; pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_IOFS, 0); + GFP_KERNEL, 0); if (pg == NULL) { CERROR("Can't allocate page %d of %d\n", i, bulk_npg); srpc_free_bulk(bk); diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c index 50e8fd2..07a6859 100644 --- a/drivers/staging/lustre/lustre/libcfs/module.c +++ b/drivers/staging/lustre/lustre/libcfs/module.c @@ -319,7 +319,7 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a struct libcfs_ioctl_data *data; int err = 0; - LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS); + LIBCFS_ALLOC_GFP(buf, 1024, GFP_KERNEL); if (buf == NULL) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c index 973c7c2..f2d018d 100644 --- a/drivers/staging/lustre/lustre/libcfs/tracefile.c +++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c @@ -810,7 +810,7 @@ int cfs_trace_allocate_string_buffer(char **str, int nob) if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */ return -EINVAL; - *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO); + *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO); if (*str == NULL) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c index c902133..fe4a722 100644 --- a/drivers/staging/lustre/lustre/llite/remote_perm.c +++ b/drivers/staging/lustre/lustre/llite/remote_perm.c @@ -82,7 +82,7 @@ static struct hlist_head *alloc_rmtperm_hash(void) struct hlist_head *hash; int i; - hash = kmem_cache_alloc(ll_rmtperm_hash_cachep, GFP_IOFS | __GFP_ZERO); + hash = kmem_cache_alloc(ll_rmtperm_hash_cachep, GFP_NOFS | __GFP_ZERO); if (!hash) return NULL; diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c index b81efcd..5f53f3b 100644 --- a/drivers/staging/lustre/lustre/mgc/mgc_request.c +++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -1112,7 
+1112,7 @@ static int mgc_apply_recover_logs(struct obd_device *mgc, LASSERT(cfg->cfg_instance != NULL); LASSERT(cfg->cfg_sb == cfg->cfg_instance); - inst = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + inst = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); if (!inst) return -ENOMEM; @@ -1308,14 +1308,14 @@ static int mgc_process_recover_log(struct obd_device *obd, if (cfg->cfg_last_idx == 0) /* the first time */ nrpages = CONFIG_READ_NRPAGES_INIT; - pages = kcalloc(nrpages, sizeof(*pages), GFP_NOFS); + pages = kcalloc(nrpages, sizeof(*pages), GFP_KERNEL); if (pages == NULL) { rc = -ENOMEM; goto out; } for (i = 0; i < nrpages; i++) { - pages[i] = alloc_page(GFP_IOFS); + pages[i] = alloc_page(GFP_KERNEL); if (pages[i] == NULL) { rc = -ENOMEM; goto out; @@ -1466,7 +1466,7 @@ static int mgc_process_cfg_log(struct obd_device *mgc, if (cld->cld_cfg.cfg_sb) lsi = s2lsi(cld->cld_cfg.cfg_sb); - env = kzalloc(sizeof(*env), GFP_NOFS); + env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return -ENOMEM; diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c index b6f000b..f61ef66 100644 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -1562,7 +1562,7 @@ static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, (oa->o_valid & OBD_MD_FLFLAGS) != 0 && (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); - gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER; + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_KERNEL : GFP_HIGHUSER; LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); LASSERT(lsm != NULL); diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c index cfb83bc..b1d1a87f 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -346,7 +346,7 @@ static struct osc_extent *osc_extent_alloc(struct osc_object *obj) { struct osc_extent *ext; - ext = kmem_cache_alloc(osc_extent_kmem, GFP_IOFS | __GFP_ZERO); + ext = kmem_cache_alloc(osc_extent_kmem, GFP_NOFS | __GFP_ZERO); if (ext == NULL) return NULL; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b56e811..86f9f7d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -134,7 +134,6 @@ struct vm_area_struct; #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) -#define GFP_IOFS (__GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ ~__GFP_KSWAPD_RECLAIM) -- cgit v0.10.2 From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:28 -0800 Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM __GFP_WAIT was used to signal that the caller was in atomic context and could not sleep. Now it is possible to distinguish between true atomic context and callers that are not willing to sleep. The latter should clear __GFP_DIRECT_RECLAIM so kswapd will still wake. As clearing __GFP_WAIT behaves differently, there is a risk that people will clear the wrong flags. This patch renames __GFP_WAIT to __GFP_RECLAIM to clearly indicate what it does -- setting it allows all reclaim activity, clearing them prevents it. 
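To make the distinction concrete, here is a minimal sketch (illustrative functions only, not taken from any converted call site): a caller that merely prefers not to sleep clears only __GFP_DIRECT_RECLAIM, so kswapd is still woken, whereas clearing __GFP_RECLAIM suppresses the kswapd wakeup as well, which is a much stronger request:

  #include <linux/gfp.h>

  /* Prefers not to block: direct reclaim is dropped, kswapd still wakes. */
  static struct page *opportunistic_alloc(unsigned int order)
  {
          return alloc_pages((GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
                             __GFP_NOWARN | __GFP_NORETRY, order);
  }

  /* Asks for no reclaim activity at all: both direct reclaim and the
   * kswapd wakeup are removed, which is rarely what a caller wants. */
  static struct page *no_reclaim_alloc(unsigned int order)
  {
          return alloc_pages((GFP_KERNEL & ~__GFP_RECLAIM) |
                             __GFP_NOWARN | __GFP_NORETRY, order);
  }
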
[akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Christoph Lameter Acked-by: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/blk-core.c b/block/blk-core.c index 9e32f08..590cca2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -2038,7 +2038,7 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { + if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { q->make_request_fn(q, bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index 68c0a34..694f870 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|__GFP_HIGH, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index dda653c..0774799 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? 
WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; goto error; } @@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e5e0f19..3dc53a1 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1007,7 +1007,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM); copy_highpage(page, b->bm_pages[page_nr]); bm_store_page_idx(page, page_nr); } else diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f504232..a28a562 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index b9242d7..562b5a4 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -723,7 +723,7 @@ static int pd_special_command(struct pd_unit *disk, struct request *rq; int err = 0; - rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7be2375..5959c29 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,14 +704,14 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * int ret = 0; rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? 
- WRITE : READ, __GFP_WAIT); + WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d58cb9e..7e505d4 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2216,7 +2216,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) mapping = file_inode(obj->base.filp)->i_mapping; gfp = mapping_gfp_mask(mapping); gfp |= __GFP_NORETRY | __GFP_NOWARN; - gfp &= ~(__GFP_IO | __GFP_WAIT); + gfp &= ~(__GFP_IO | __GFP_RECLAIM); sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1362ad8..05352f4 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, struct request *rq; int error; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = (char *)pc; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 64a6b82..ef907fd 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -441,7 +441,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, struct request *rq; int error; - rq = blk_get_request(drive->queue, write, __GFP_WAIT); + rq = blk_get_request(drive->queue, write, __GFP_RECLAIM); memcpy(rq->cmd, cmd, BLK_MAX_CDB); rq->cmd_type = REQ_TYPE_ATA_PC; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 066e390..474173e 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -303,7 +303,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) struct request *rq; int ret; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_flags = REQ_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index b05a74d..0dd43b4 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -165,7 +165,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, if (!(setting->flags & DS_SYNC)) return setting->set(drive, arg); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 5; rq->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 56b9708..37a8a90 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -477,7 +477,7 @@ static int set_multcount(ide_drive_t *drive, int arg) if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) return -EBUSY; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index aa2e9b7..d05db24 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) if (NULL == (void *) arg) { struct request *rq; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = 
REQ_TYPE_ATA_TASKFILE; err = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); @@ -221,7 +221,7 @@ static int generic_drive_reset(ide_drive_t *drive) struct request *rq; int ret = 0; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index c808685..2d7dca5 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -31,7 +31,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) } spin_unlock_irq(&hwif->lock); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd[0] = REQ_PARK_HEADS; rq->cmd_len = 1; rq->cmd_type = REQ_TYPE_DRV_PRIV; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 081e434..e34af48 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -18,7 +18,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -88,7 +88,7 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index f5d51d1..12fa049 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -852,7 +852,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); BUG_ON(size < 0 || size % tape->blk_size); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; @@ -860,7 +860,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) if (size) { ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out_put; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0979e12..a716693 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -430,7 +430,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, int error; int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE; - rq = blk_get_request(drive->queue, rw, __GFP_WAIT); + rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; /* @@ -441,7 +441,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, */ if (nsect) { error = blk_rq_map_kern(drive->queue, rq, buf, - nsect * SECTOR_SIZE, __GFP_WAIT); + nsect * SECTOR_SIZE, __GFP_RECLAIM); if (error) goto put_req; } diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 7e00470..4ff340f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1680,7 +1680,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. 
*/ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = rcd->rcvegrcnt; egroff = rcd->rcvegr_tid_base; diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 8930087..1e688bf 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -75,7 +75,7 @@ MODULE_LICENSE("GPL"); /* * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't - * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use + * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use * __GFP_NOWARN, to suppress page allocation failure warnings. */ #define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e878590..6c19555 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1025,11 +1025,13 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->special = (void *)0; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_kern(q, req, buffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; bio = req->bio; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 66a96cd..984ddcb 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1970,7 +1970,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) struct request *req; /* - * blk_get_request with GFP_KERNEL (__GFP_WAIT) sleeps until a + * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a * request becomes available */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 126a48c..dd8ad2a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -222,13 +222,13 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int write = (data_direction == DMA_TO_DEVICE); int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, write, __GFP_WAIT); + req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM); if (IS_ERR(req)) return ret; blk_rq_set_block_pc(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, - buffer, bufflen, __GFP_WAIT)) + buffer, bufflen, __GFP_RECLAIM)) goto out; req->cmd_len = COMMAND_SIZE(cmd[0]); diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 47a1202..8666f3a 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -1560,7 +1560,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; /* * The minimum size of the eager buffers is a groups of MTU-sized diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c index 5d9b9db..13c3cd1 100644 --- a/drivers/staging/rdma/ipath/ipath_file_ops.c +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -905,7 +905,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) * heavy filesystem activity makes these fail, and we can * use compound pages. 
*/ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = dd->ipath_rcvegrcnt; /* TID number offset for this port */ diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index aecd085..9c4b737 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -30,7 +30,7 @@ extern unsigned cachefiles_debug; #define CACHEFILES_DEBUG_KLEAVE 2 #define CACHEFILES_DEBUG_KDEBUG 4 -#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) /* * node records diff --git a/fs/direct-io.c b/fs/direct-io.c index 3ae0e04..18e7554 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -361,7 +361,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, /* * bio_alloc() is guaranteed to return a bio when called with - * __GFP_WAIT and we request a valid number of vectors. + * __GFP_RECLAIM and we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index fe529a8..03246ca 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) } /* Default GFP flags using highmem */ -#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) +#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 86f9f7d..3692272 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -107,7 +107,7 @@ struct vm_area_struct; * can be cleared when the reclaiming of pages would cause unnecessary * disruption. */ -#define __GFP_WAIT ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ @@ -126,12 +126,12 @@ struct vm_area_struct; */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) -#define GFP_NOIO (__GFP_WAIT) -#define GFP_NOFS (__GFP_WAIT | __GFP_IO) -#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ +#define GFP_NOIO (__GFP_RECLAIM) +#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) +#define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) -#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ @@ -143,12 +143,12 @@ struct vm_area_struct; #define GFP_MOVABLE_SHIFT 3 /* Control page allocator reclaim behavior */ -#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) /* Control slab gfp mask during early boot */ -#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* 
Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb..12cd989 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index f757151..6d40944 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -135,7 +135,7 @@ static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course). * * @gfp indicates whether or not to wait until a free id is available (it's not - * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep + * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep * however long it takes until another thread frees an id (same semantics as a * mempool). 
* diff --git a/mm/failslab.c b/mm/failslab.c index 98fb490..79171b4 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -3,11 +3,11 @@ static struct { struct fault_attr attr; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .cache_filter = false, }; @@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) @@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/mm/filemap.c b/mm/filemap.c index 58e04e2..6ef3674 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2713,7 +2713,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * */ int try_to_release_page(struct page *page, gfp_t gfp_mask) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5c08b4..9812d46 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 05374f0..a547067 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2120,7 +2120,7 @@ done_restock: /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here - * if __GFP_WAIT but let's always punt for simplicity and so that + * if __GFP_RECLAIM but let's always punt for simplicity and so that * GFP_KERNEL can consistently be used during reclaim. @memcg is * not recorded as it most likely matches current's and won't * change in the meantime. 
As high limit is checked again before diff --git a/mm/migrate.c b/mm/migrate.c index e60379e..7890d0b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1752,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70461f3..1b37309 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2160,11 +2160,11 @@ static struct { struct fault_attr attr; bool ignore_gfp_highmem; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2183,7 +2183,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2202,7 +2203,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index e24121a..6eb6293 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -126,7 +126,7 @@ static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size, { void *ptr; int order = ima_maxorder; - gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY; + gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY; if (order) order = min(get_order(max_size), order); -- cgit v0.10.2 From f77cf4e4cc9d40310a7224a1a67c733aeec78836 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:31 -0800 Subject: mm, page_alloc: delete the zonelist_cache The zonelist cache (zlc) was introduced to skip over zones that were recently known to be full. This avoided expensive operations such as the cpuset checks, watermark calculations and zone_reclaim. The situation today is different and the complexity of zlc is harder to justify. 1) The cpuset checks are no-ops unless a cpuset is active and in general are a lot cheaper. 2) zone_reclaim is now disabled by default and I suspect that was a large source of the cost that zlc wanted to avoid. When it is enabled, it's known to be a major source of stalling when nodes fill up and it's unwise to hit every other user with the overhead. 3) Watermark checks are expensive to calculate for high-order allocation requests. Later patches in this series will reduce the cost of the watermark checking. 4) The most important issue is that in the current implementation it is possible for a failed THP allocation to mark a zone full for order-0 allocations and cause a fallback to remote nodes. The last issue could be addressed with additional complexity but as the benefit of zlc is questionable, it is better to remove it. 
If stalls due to zone_reclaim are ever reported then an alternative would be to introduce deferring logic based on a timeout inside zone_reclaim itself and leave the page allocator fast paths alone. The impact on page-allocator microbenchmarks is negligible as they don't hit the paths where the zlc comes into play. Most page-reclaim related workloads showed no noticeable difference as a result of the removal. The impact was noticeable in a workload called "stutter". One part uses a lot of anonymous memory, a second measures mmap latency and a third copies a large file. In an ideal world the latency application would not notice the mmap latency. On a 2-node machine the results of this patch are stutter 4.3.0-rc1 4.3.0-rc1 baseline nozlc-v4 Min mmap 20.9243 ( 0.00%) 20.7716 ( 0.73%) 1st-qrtle mmap 22.0612 ( 0.00%) 22.0680 ( -0.03%) 2nd-qrtle mmap 22.3291 ( 0.00%) 22.3809 ( -0.23%) 3rd-qrtle mmap 25.2244 ( 0.00%) 25.2396 ( -0.06%) Max-90% mmap 48.0995 ( 0.00%) 28.3713 ( 41.02%) Max-93% mmap 52.5557 ( 0.00%) 36.0170 ( 31.47%) Max-95% mmap 55.8173 ( 0.00%) 47.3163 ( 15.23%) Max-99% mmap 67.3781 ( 0.00%) 70.1140 ( -4.06%) Max mmap 24447.6375 ( 0.00%) 12915.1356 ( 47.17%) Mean mmap 33.7883 ( 0.00%) 27.7944 ( 17.74%) Best99%Mean mmap 27.7825 ( 0.00%) 25.2767 ( 9.02%) Best95%Mean mmap 26.3912 ( 0.00%) 23.7994 ( 9.82%) Best90%Mean mmap 24.9886 ( 0.00%) 23.2251 ( 7.06%) Best50%Mean mmap 22.0157 ( 0.00%) 22.0261 ( -0.05%) Best10%Mean mmap 21.6705 ( 0.00%) 21.6083 ( 0.29%) Best5%Mean mmap 21.5581 ( 0.00%) 21.4611 ( 0.45%) Best1%Mean mmap 21.3079 ( 0.00%) 21.1631 ( 0.68%) Note that the maximum stall latency went from 24 seconds to 12 which is still bad but an improvement. The milage varies considerably 2-node machine on an earlier test went from 494 seconds to 47 seconds and a 4-node machine that tested an earlier version of this patch went from a worst case stall time of 6 seconds to 67ms. The nature of the benchmark is inherently unpredictable as it is hammering the system and the milage will vary between machines. There is a secondary impact with potentially more direct reclaim because zones are now being considered instead of being skipped by zlc. In this particular test run it did not occur so will not be described. However, in at least one test the following was observed 1. Direct reclaim rates were higher. This was likely due to direct reclaim being entered instead of the zlc disabling a zone and busy looping. Busy looping may have the effect of allowing kswapd to make more progress and in some cases may be better overall. If this is found then the correct action is to put direct reclaimers to sleep on a waitqueue and allow kswapd make forward progress. Busy looping on the zlc is even worse than when the allocator used to blindly call congestion_wait(). 2. There was higher swap activity as direct reclaim was active. 3. Direct reclaim efficiency was lower. This is related to 1 as more scanning activity also encountered more pages that could not be immediately reclaimed In that case, the direct page scan and reclaim rates are noticeable but it is not considered a problem for a few reasons 1. The test is primarily concerned with latency. The mmap attempts are also faulted which means there are THP allocation requests. The ZLC could cause zones to be disabled causing the process to busy loop instead of reclaiming. This looks like elevated direct reclaim activity but it's the correct action to take based on what processes requested. 2. The test hammers reclaim and compaction heavily. 
The number of successful THP faults is highly variable but affects the reclaim stats. It's not a realistic or reasonable measure of page reclaim activity. 3. No other page-reclaim intensive workload that was tested showed a problem. 4. If a workload is identified that benefitted from the busy looping then it should be fixed by having direct reclaimers sleep on a wait queue until woken by kswapd instead of busy looping. We had this class of problem before when congestion_waits() with a fixed timeout was a brain damaged decision but happened to benefit some workloads. If a workload is identified that relied on the zlc to busy loop then it should be fixed correctly and have a direct reclaimer sleep on a waitqueue until woken by kswapd. Signed-off-by: Mel Gorman Acked-by: David Rientjes Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 38bed71..1e88aae 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -589,75 +589,8 @@ static inline bool zone_is_empty(struct zone *zone) * [1] : No fallback (__GFP_THISNODE) */ #define MAX_ZONELISTS 2 - - -/* - * We cache key information from each zonelist for smaller cache - * footprint when scanning for free pages in get_page_from_freelist(). - * - * 1) The BITMAP fullzones tracks which zones in a zonelist have come - * up short of free memory since the last time (last_fullzone_zap) - * we zero'd fullzones. - * 2) The array z_to_n[] maps each zone in the zonelist to its node - * id, so that we can efficiently evaluate whether that node is - * set in the current tasks mems_allowed. - * - * Both fullzones and z_to_n[] are one-to-one with the zonelist, - * indexed by a zones offset in the zonelist zones[] array. - * - * The get_page_from_freelist() routine does two scans. During the - * first scan, we skip zones whose corresponding bit in 'fullzones' - * is set or whose corresponding node in current->mems_allowed (which - * comes from cpusets) is not set. During the second scan, we bypass - * this zonelist_cache, to ensure we look methodically at each zone. - * - * Once per second, we zero out (zap) fullzones, forcing us to - * reconsider nodes that might have regained more free memory. - * The field last_full_zap is the time we last zapped fullzones. - * - * This mechanism reduces the amount of time we waste repeatedly - * reexaming zones for free memory when they just came up low on - * memory momentarilly ago. - * - * The zonelist_cache struct members logically belong in struct - * zonelist. However, the mempolicy zonelists constructed for - * MPOL_BIND are intentionally variable length (and usually much - * shorter). A general purpose mechanism for handling structs with - * multiple variable length members is more mechanism than we want - * here. We resort to some special case hackery instead. - * - * The MPOL_BIND zonelists don't need this zonelist_cache (in good - * part because they are shorter), so we put the fixed length stuff - * at the front of the zonelist struct, ending in a variable length - * zones[], as is needed by MPOL_BIND. - * - * Then we put the optional zonelist cache on the end of the zonelist - * struct. This optional stuff is found by a 'zlcache_ptr' pointer in - * the fixed length portion at the front of the struct. 
This pointer - * both enables us to find the zonelist cache, and in the case of - * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) - * to know that the zonelist cache is not there. - * - * The end result is that struct zonelists come in two flavors: - * 1) The full, fixed length version, shown below, and - * 2) The custom zonelists for MPOL_BIND. - * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. - * - * Even though there may be multiple CPU cores on a node modifying - * fullzones or last_full_zap in the same zonelist_cache at the same - * time, we don't lock it. This is just hint data - if it is wrong now - * and then, the allocator will still function, perhaps a bit slower. - */ - - -struct zonelist_cache { - unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ - DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ - unsigned long last_full_zap; /* when last zap'd (jiffies) */ -}; #else #define MAX_ZONELISTS 1 -struct zonelist_cache; #endif /* @@ -675,9 +608,6 @@ struct zoneref { * allocation, the other zones are fallback zones, in decreasing * priority. * - * If zlcache_ptr is not NULL, then it is just the address of zlcache, - * as explained above. If zlcache_ptr is NULL, there is no zlcache. - * * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are @@ -687,11 +617,7 @@ struct zoneref { * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { - struct zonelist_cache *zlcache_ptr; // NULL or &zlcache struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; -#ifdef CONFIG_NUMA - struct zonelist_cache zlcache; // optional ... -#endif }; #ifndef CONFIG_DISCONTIGMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b37309..8dc6e3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2292,122 +2292,6 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -/* - * zlc_setup - Setup for "zonelist cache". Uses cached zone data to - * skip over zones that are not allowed by the cpuset, or that have - * been recently (in last second) found to be nearly full. See further - * comments in mmzone.h. Reduces cache footprint of zonelist scans - * that have to skip over a lot of full or unallowed zones. - * - * If the zonelist cache is present in the passed zonelist, then - * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_MEMORY].) - * - * If the zonelist cache is not available for this zonelist, does - * nothing and returns NULL. - * - * If the fullzones BITMAP in the zonelist cache is stale (more than - * a second since last zap'd) then we zap it out (clear its bits.) - * - * We hold off even calling zlc_setup, until after we've checked the - * first zone in the zonelist, on the theory that most allocations will - * be satisfied from that first zone, so best to examine that zone as - * quickly as we can. - */ -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - nodemask_t *allowednodes; /* zonelist_cache approximation */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return NULL; - - if (time_after(jiffies, zlc->last_full_zap + HZ)) { - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - zlc->last_full_zap = jiffies; - } - - allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 
- &cpuset_current_mems_allowed : - &node_states[N_MEMORY]; - return allowednodes; -} - -/* - * Given 'z' scanning a zonelist, run a couple of quick checks to see - * if it is worth looking at further for free memory: - * 1) Check that the zone isn't thought to be full (doesn't have its - * bit set in the zonelist_cache fullzones BITMAP). - * 2) Check that the zones node (obtained from the zonelist_cache - * z_to_n[] mapping) is allowed in the passed in allowednodes mask. - * Return true (non-zero) if zone is worth looking at further, or - * else return false (zero) if it is not. - * - * This check -ignores- the distinction between various watermarks, - * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is - * found to be full for any variation of these watermarks, it will - * be considered full for up to one second by all requests, unless - * we are so low on memory on all allowed nodes that we are forced - * into the second scan of the zonelist. - * - * In the second scan we ignore this zonelist cache and exactly - * apply the watermarks to all zones, even it is slower to do so. - * We are low on memory in the second scan, and should leave no stone - * unturned looking for a free page. - */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - int n; /* node that zone *z is on */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return 1; - - i = z - zonelist->_zonerefs; - n = zlc->z_to_n[i]; - - /* This zone is worth trying if it is allowed but not full */ - return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); -} - -/* - * Given 'z' scanning a zonelist, set the corresponding bit in - * zlc->fullzones, so that subsequent attempts to allocate a page - * from that zone don't waste time re-examining it. 
- */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - i = z - zonelist->_zonerefs; - - set_bit(i, zlc->fullzones); -} - -/* - * clear all zones full, called after direct reclaim makes progress so that - * a zone that was recently full is not skipped over for up to a second - */ -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return local_zone->node == zone->node; @@ -2418,28 +2302,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } - #else /* CONFIG_NUMA */ - -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - return NULL; -} - -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - return 1; -} - -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ -} - -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; @@ -2449,7 +2312,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } - #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) @@ -2476,9 +2338,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ - int zlc_active = 0; /* set if using zonelist_cache */ - int did_zlc_setup = 0; /* just call zlc_setup() one time */ int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2493,9 +2352,6 @@ zonelist_scan: ac->nodemask) { unsigned long mark; - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) - continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) @@ -2553,28 +2409,8 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && - !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup if there are multiple nodes - * and before considering the first zone allowed - * by the cpuset. - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } - if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) - goto this_zone_full; - - /* - * As we may have just activated ZLC, check if the first - * eligible zone has failed zone_reclaim recently. - */ - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2591,19 +2427,6 @@ zonelist_scan: ac->classzone_idx, alloc_flags)) goto try_this_zone; - /* - * Failed to reclaim enough to meet watermark. 
- * Only mark the zone full if checking the min - * watermark or if we failed to reclaim just - * 1<zonelist); - retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -4228,20 +4038,6 @@ static void build_zonelists(pg_data_t *pgdat) build_thisnode_zonelists(pgdat); } -/* Construct the zonelist performance cache - see further mmzone.h */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zoneref *z; - - zonelist = &pgdat->node_zonelists[0]; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->_zonerefs; z->zone; z++) - zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); -} - #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. @@ -4302,12 +4098,6 @@ static void build_zonelists(pg_data_t *pgdat) zonelist->_zonerefs[j].zone_idx = 0; } -/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - pgdat->node_zonelists[0].zlcache_ptr = NULL; -} - #endif /* CONFIG_NUMA */ /* @@ -4348,14 +4138,12 @@ static int __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); - build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); - build_zonelist_cache(pgdat); } /* -- cgit v0.10.2 From 974a786e63c96a2401a78ddba926f34c128474f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:34 -0800 Subject: mm, page_alloc: remove MIGRATE_RESERVE MIGRATE_RESERVE preserves an old property of the buddy allocator that existed prior to fragmentation avoidance -- min_free_kbytes worth of pages tended to remain contiguous until the only alternative was to fail the allocation. At the time it was discovered that high-order atomic allocations relied on this property so MIGRATE_RESERVE was introduced. A later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch deletes MIGRATE_RESERVE and supporting code so it'll be easier to review. Note that this patch in isolation may look like a false regression if someone was bisecting high-order atomic allocation failures. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e88aae..b86cfa3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -39,8 +39,6 @@ enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, - MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ - MIGRATE_RESERVE = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -63,6 +61,8 @@ enum { MIGRATE_TYPES }; +#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1) + #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) #else @@ -429,12 +429,6 @@ struct zone { const char *name; - /* - * Number of MIGRATE_RESERVE page block. To maintain for just - * optimization. Protected by zone->lock. - */ - int nr_migrate_reserve_block; - #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. 
It is used to solve incorrect diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9812d46..dabd247 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void) for_each_populated_zone(zone) nr_zones++; - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8dc6e3c..5888126 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); @@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][4] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif - [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; @@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, *can_steal = false; for (i = 0;; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_RESERVE) + if (fallback_mt == MIGRATE_TYPES) break; if (list_empty(&area->free_list[fallback_mt])) @@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; -retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { + if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); if (!page) page = __rmqueue_fallback(zone, order, migratetype); - - /* - * Use MIGRATE_RESERVE rather than fail an allocation. 
goto - * is used because __rmqueue_smallest is an inline function - * and we want just one call site - */ - if (!page) { - migratetype = MIGRATE_RESERVE; - goto retry_reserve; - } } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type) [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RESERVE] = 'R', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size) } /* - * Check if a pageblock contains reserved pages - */ -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) - return 1; - } - return 0; -} - -/* - * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on min_wmark_pages(zone). The memory within - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes - * higher will lead to a bigger reserve which will get freed as contiguous - * blocks as reclaim kicks in - */ -static void setup_zone_migrate_reserve(struct zone *zone) -{ - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; - struct page *page; - unsigned long block_migratetype; - int reserve; - int old_reserve; - - /* - * Get the start pfn, end pfn and the number of blocks to reserve - * We have to be careful to be aligned to pageblock_nr_pages to - * make sure that we always check pfn_valid for the first page in - * the block. - */ - start_pfn = zone->zone_start_pfn; - end_pfn = zone_end_pfn(zone); - start_pfn = roundup(start_pfn, pageblock_nr_pages); - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> - pageblock_order; - - /* - * Reserve blocks are generally in place to help high-order atomic - * allocations that are short-lived. A min_free_kbytes value that - * would result in more than 2 reserve blocks for atomic allocations - * is assumed to be in place to help anti-fragmentation for the - * future allocation of hugepages at runtime. - */ - reserve = min(2, reserve); - old_reserve = zone->nr_migrate_reserve_block; - - /* When memory hot-add, we almost always need to do nothing */ - if (reserve == old_reserve) - return; - zone->nr_migrate_reserve_block = reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) - return; - - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - - /* Watch out for overlapping nodes */ - if (page_to_nid(page) != zone_to_nid(zone)) - continue; - - block_migratetype = get_pageblock_migratetype(page); - - /* Only test what is necessary when the reserves are not met */ - if (reserve > 0) { - /* - * Blocks with reserved pages will never free, skip - * them. - */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - - /* If this block is reserved, account for it */ - if (block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } - - /* Suitable for reserving if this block is movable */ - if (block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, - MIGRATE_RESERVE); - move_freepages_block(zone, page, - MIGRATE_RESERVE); - reserve--; - continue; - } - } else if (!old_reserve) { - /* - * At boot time we don't need to scan the whole zone - * for turning off MIGRATE_RESERVE. 
- */ - break; - } - - /* - * If the reserve is met and this is a previous reserved block, - * take it back - */ - if (block_migratetype == MIGRATE_RESERVE) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); - } - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() + * kernel allocations are made. * * bitmap is created for zone's valid pfn range. but memmap * can be created for invalid pages (for alignment) @@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void) high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/vmstat.c b/mm/vmstat.c index ffcb4f5..5b289dc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Reclaimable", "Movable", - "Reserve", #ifdef CONFIG_CMA "CMA", #endif -- cgit v0.10.2 From 0aaa29a56e4fb0fc9e24edb649e2733a672ca099 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:37 -0800 Subject: mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand High-order watermark checking exists for two reasons -- kswapd high-order awareness and protection for high-order atomic requests. Historically the kernel depended on MIGRATE_RESERVE to preserve min_free_kbytes as high-order free pages for as long as possible. This patch introduces MIGRATE_HIGHATOMIC that reserves pageblocks for high-order atomic allocations on demand and avoids using those blocks for order-0 allocations. This is more flexible and reliable than MIGRATE_RESERVE was. A MIGRATE_HIGHORDER pageblock is created when an atomic high-order allocation request steals a pageblock but limits the total number to 1% of the zone. Callers that speculatively abuse atomic allocations for long-lived high-order allocations to access the reserve will quickly fail. Note that SLUB is currently not such an abuser as it reclaims at least once. It is possible that the pageblock stolen has few suitable high-order pages and will need to steal again in the near future but there would need to be strong justification to search all pageblocks for an ideal candidate. The pageblocks are unreserved if an allocation fails after a direct reclaim attempt. The watermark checks account for the reserved pageblocks when the allocation request is not a high-order atomic allocation. The reserved pageblocks can not be used for order-0 allocations. This may allow temporary wastage until a failed reclaim reassigns the pageblock. This is deliberate as the intent of the reservation is to satisfy a limited number of atomic high-order short-lived requests if the system requires them. The stutter benchmark was used to evaluate this but while it was running there was a systemtap script that randomly allocated between 1 high-order page and 12.5% of memory's worth of order-3 pages using GFP_ATOMIC. 
This is much larger than the potential reserve and it does not attempt to be realistic. It is intended to stress random high-order allocations from an unknown source, show that there is a reduction in failures without introducing an anomaly where atomic allocations are more reliable than regular allocations. The amount of memory reserved varied throughout the workload as reserves were created and reclaimed under memory pressure. The allocation failures once the workload warmed up were as follows; 4.2-rc5-vanilla 70% 4.2-rc5-atomic-reserve 56% The failure rate was also measured while building multiple kernels. The failure rate was 14% but is 6% with this patch applied. Overall, this is a small reduction but the reserves are small relative to the number of allocation requests. In early versions of the patch, the failure rate reduced by a much larger amount but that required much larger reserves and perversely made atomic allocations seem more reliable than regular allocations. [yalin.wang2010@gmail.com: fix redundant check and a memory leak] Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: yalin wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b86cfa3..d3bafe4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -39,6 +39,8 @@ enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, + MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ + MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -61,8 +63,6 @@ enum { MIGRATE_TYPES }; -#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1) - #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) #else @@ -334,6 +334,8 @@ struct zone { /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; + unsigned long nr_reserved_highatomic; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5888126..55e9c56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1615,6 +1615,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } +/* + * Reserve a pageblock for exclusive use of high-order atomic allocations if + * there are no empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, + unsigned int alloc_order) +{ + int mt; + unsigned long max_managed, flags; + + /* + * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Check is race-prone but harmless. + */ + max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! 
*/ + mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_HIGHATOMIC && + !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + zone->nr_reserved_highatomic += pageblock_nr_pages; + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + */ +static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + /* Preserve at least one pageblock */ + if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &(zone->free_area[order]); + + if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + continue; + + page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, + struct page, lru); + + /* + * It should never happen but changes to locking could + * inadvertently allow a per-cpu drain to add pages + * to MIGRATE_HIGHATOMIC while unreserving so be safe + * and watch for underflows. + */ + zone->nr_reserved_highatomic -= min(pageblock_nr_pages, + zone->nr_reserved_highatomic); + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + set_pageblock_migratetype(page, ac->migratetype); + move_freepages_block(zone, page, ac->migratetype); + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) @@ -1670,7 +1765,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. 
*/ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) + int migratetype, gfp_t gfp_flags) { struct page *page; @@ -1700,7 +1795,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page = __rmqueue(zone, order, migratetype, 0); if (unlikely(page == NULL)) break; @@ -2072,7 +2167,7 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; struct page *page; @@ -2115,7 +2210,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype, gfp_flags); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2226,15 +2329,24 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages may go negative - that's OK */ long min = mark; int o; long free_cma = 0; + /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (alloc_flags & ALLOC_HARDER) + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!(alloc_flags & ALLOC_HARDER))) + free_pages -= z->nr_reserved_highatomic; + else min -= min / 4; #ifdef CONFIG_CMA @@ -2419,10 +2531,18 @@ zonelist_scan: try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, - gfp_mask, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + return page; } } @@ -2695,9 +2815,11 @@ retry: /* * If an allocation failed after direct reclaim, it could be because - * pages are pinned on the per-cpu lists. Drain them and try again + * pages are pinned on the per-cpu lists or in high alloc reserves. + * Shrink them them and try again */ if (!page && !drained) { + unreserve_highatomic_pageblock(ac); drain_all_pages(NULL); drained = true; goto retry; diff --git a/mm/vmstat.c b/mm/vmstat.c index 5b289dc..879a2be 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -923,6 +923,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Reclaimable", "Movable", + "HighAtomic", #ifdef CONFIG_CMA "CMA", #endif -- cgit v0.10.2 From 97a16fc82a7c5b0cfce95c05dfb9561e306ca1b1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:40 -0800 Subject: mm, page_alloc: only enforce watermarks for order-0 allocations The primary purpose of watermarks is to ensure that reclaim can always make forward progress in PF_MEMALLOC context (kswapd and direct reclaim). 
These assume that order-0 allocations are all that is necessary for forward progress. High-order watermarks serve a different purpose. Kswapd had no high-order awareness before they were introduced (https://lkml.kernel.org/r/413AA7B2.4000907@yahoo.com.au). This was particularly important when there were high-order atomic requests. The watermarks both gave kswapd awareness and made a reserve for those atomic requests. There are two important side-effects of this. The most important is that a non-atomic high-order request can fail even though free pages are available and the order-0 watermarks are ok. The second is that high-order watermark checks are expensive as the free list counts up to the requested order must be examined. With the introduction of MIGRATE_HIGHATOMIC it is no longer necessary to have high-order watermarks. Kswapd and compaction still need high-order awareness which is handled by checking that at least one suitable high-order page is free. With the patch applied, there was little difference in the allocation failure rates as the atomic reserves are small relative to the number of allocation attempts. The expected impact is that there will never be an allocation failure report that shows suitable pages on the free lists. The one potential side-effect of this is that in a vanilla kernel, the watermark checks may have kept a free page for an atomic allocation. Now, we are 100% relying on the HighAtomic reserves and an early allocation to have allocated them. If the first high-order atomic allocation is after the system is already heavily fragmented then it'll fail. [akpm@linux-foundation.org: simplify __zone_watermark_ok(), per Vlastimil] Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 55e9c56..b8d560a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2322,8 +2322,10 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return true if free pages are above 'mark'. This takes into account the order - * of the allocation. + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true of the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, @@ -2331,7 +2333,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { long min = mark; int o; - long free_cma = 0; + const int alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2344,7 +2346,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, * the high-atomic reserves. This will over-estimate the size of the * atomic reserve but it avoids a search. 
*/ - if (likely(!(alloc_flags & ALLOC_HARDER))) + if (likely(!alloc_harder)) free_pages -= z->nr_reserved_highatomic; else min -= min / 4; @@ -2352,22 +2354,45 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + /* + * Check watermarks for an order-0 allocation request. If these + * are not met, then a high-order request also cannot go ahead + * even if a suitable page happened to be free. + */ + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return false; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - /* Require fewer higher order pages to be free */ - min >>= 1; + /* If this is an order-0 request then the watermark is fine */ + if (!order) + return true; + + /* For a high-order request, check at least one suitable page is free */ + for (o = order; o < MAX_ORDER; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + + if (!area->nr_free) + continue; + + if (alloc_harder) + return true; - if (free_pages <= min) - return false; + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { + if (!list_empty(&area->free_list[mt])) + return true; + } + +#ifdef CONFIG_CMA + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { + return true; + } +#endif } - return true; + return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, -- cgit v0.10.2 From dd56b046426760aa0c852ad6e4b6b07891222d65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:43 -0800 Subject: mm: page_alloc: hide some GFP internals and document the bits and flag combinations Andrew stated the following We have quite a history of remote parts of the kernel using weird/wrong/inexplicable combinations of __GFP_ flags. I tend to think that this is because we didn't adequately explain the interface. And I don't think that gfp.h really improved much in this area as a result of this patchset. Could you go through it some time and decide if we've adequately documented all this stuff? This patches first moves some GFP flag combinations that are part of the MM internals to mm/internal.h. The rest of the patch documents the __GFP_FOO bits under various headings and then documents the flag combinations. It will not help callers that are brain damaged but the clarity might motivate some fixes and avoid future mistakes. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Rik van Riel Cc: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3692272..6523109 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,9 +39,7 @@ struct vm_area_struct; /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* - * GFP bitmasks.. - * - * Zone modifiers (see linux/mmzone.h - low three bits) + * Physical address zone modifiers (see linux/mmzone.h - low four bits) * * Do not put any conditional on these. If necessary modify the definitions * without the underscores and use them consistently. 
The definitions here may @@ -51,120 +49,211 @@ struct vm_area_struct; #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */ +#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) + /* - * Action modifiers - doesn't change the zoning + * Page mobility and placement hints * - * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. + * These flags provide hints about how mobile the page is. Pages with similar + * mobility are placed within the same pageblocks to minimise problems due + * to external fragmentation. * - * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. New users should be evaluated carefully - * (and the flag should be used only when there is no reasonable failure policy) - * but it is definitely preferable to use the flag rather than opencode endless - * loop around allocator. + * __GFP_MOVABLE (also a zone modifier) indicates that the page can be + * moved by page migration during memory compaction or can be reclaimed. * - * __GFP_NORETRY: The VM implementation must not retry indefinitely and will - * return NULL when direct reclaim and memory compaction have failed to allow - * the allocation to succeed. The OOM killer is not called with the current - * implementation. + * __GFP_RECLAIMABLE is used for slab allocations that specify + * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. + * + * __GFP_WRITE indicates the caller intends to dirty the page. Where possible, + * these pages will be spread between local zones to avoid all the dirty + * pages being in one zone (fair zone allocation policy). * - * __GFP_MOVABLE: Flag that this page will be movable by the page migration - * mechanism or reclaimed + * __GFP_HARDWALL enforces the cpuset memory allocation policy. + * + * __GFP_THISNODE forces the allocation to be satisified from the requested + * node with no fallbacks or placement policy enforcements. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) /* Caller cannot wait or reschedule */ -#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */ -#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */ -#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? */ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */ -#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */ -#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ -#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ -#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ -#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */ -#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ -#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ -#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves. 
- * This takes precedence over the - * __GFP_MEMALLOC flag if both are - * set - */ -#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ -#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ -#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ -#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */ -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ - -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ +#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) +#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) +#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) /* - * A caller that is willing to wait may enter direct reclaim and will - * wake kswapd to reclaim pages in the background until the high - * watermark is met. A caller may wish to clear __GFP_DIRECT_RECLAIM to - * avoid unnecessary delays when a fallback option is available but - * still allow kswapd to reclaim in the background. The kswapd flag - * can be cleared when the reclaiming of pages would cause unnecessary - * disruption. + * Watermark modifiers -- controls access to emergency reserves + * + * __GFP_HIGH indicates that the caller is high-priority and that granting + * the request is necessary before the system can make forward progress. + * For example, creating an IO context to clean pages. + * + * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is + * high priority. Users are typically interrupt handlers. This may be + * used in conjunction with __GFP_HIGH + * + * __GFP_MEMALLOC allows access to all memory. This should only be used when + * the caller guarantees the allocation will allow more memory to be freed + * very shortly e.g. process exiting or swapping. Users either should + * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * + * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. + * This takes precedence over the __GFP_MEMALLOC flag if both are set. + * + * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement. */ -#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) +#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) +#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) +#define __GFP_NOACCOUNT ((__force gfp_t)___GFP_NOACCOUNT) + +/* + * Reclaim modifiers + * + * __GFP_IO can start physical IO. + * + * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the + * allocator recursing into the filesystem which might already be holding + * locks. + * + * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. + * This flag can be cleared to avoid unnecessary delays when a fallback + * option is available. + * + * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when + * the low watermark is reached and have it reclaim pages until the high + * watermark is reached. A caller may wish to clear this flag when fallback + * options are available and the reclaim is likely to disrupt the system. 
The + * canonical example is THP allocation where a fallback is cheap but + * reclaim/compaction may cause indirect stalls. + * + * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. + * + * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt + * _might_ fail. This depends upon the particular VM implementation. + * + * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller + * cannot handle allocation failures. New users should be evaluated carefully + * (and the flag should be used only when there is no reasonable failure + * policy) but it is definitely preferable to use the flag rather than + * opencode endless loop around allocator. + * + * __GFP_NORETRY: The VM implementation must not retry indefinitely and will + * return NULL when direct reclaim and memory compaction have failed to allow + * the allocation to succeed. The OOM killer is not called with the current + * implementation. + */ +#define __GFP_IO ((__force gfp_t)___GFP_IO) +#define __GFP_FS ((__force gfp_t)___GFP_FS) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ +#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) +#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) +#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* - * This may seem redundant, but it's a way of annotating false positives vs. - * allocations that simply cannot be supported (e.g. page tables). + * Action modifiers + * + * __GFP_COLD indicates that the caller does not expect to be used in the near + * future. Where possible, a cache-cold page will be returned. + * + * __GFP_NOWARN suppresses allocation failure reports. + * + * __GFP_COMP address compound page metadata. + * + * __GFP_ZERO returns a zeroed page on success. + * + * __GFP_NOTRACK avoids tracking with kmemcheck. + * + * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of + * distinguishing in the source between false positives and allocations that + * cannot be supported (e.g. page tables). + * + * __GFP_OTHER_NODE is for allocations that are on a remote node but that + * should not be accounted for as a remote allocation in vmstat. A + * typical user would be khugepaged collapsing a huge page on a remote + * node. */ +#define __GFP_COLD ((__force gfp_t)___GFP_COLD) +#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) +#define __GFP_COMP ((__force gfp_t)___GFP_COMP) +#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) +#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) -#define __GFP_BITS_SHIFT 26 /* Room for N __GFP_FOO bits */ +/* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* - * GFP_ATOMIC callers can not sleep, need the allocation to succeed. - * A lower watermark is applied to allow access to "atomic reserves" + * Useful GFP flag combinations that are commonly used. It is recommended + * that subsystems start with one of these combinations and then set/clear + * __GFP_FOO flags as necessary. + * + * GFP_ATOMIC users can not sleep and need the allocation to succeed. 
A lower + * watermark is applied to allow access to "atomic reserves" + * + * GFP_KERNEL is typical for kernel-internal allocations. The caller requires + * ZONE_NORMAL or a lower zone for direct access but can direct reclaim. + * + * GFP_NOWAIT is for kernel allocations that should not stall for direct + * reclaim, start physical IO or use any filesystem callback. + * + * GFP_NOIO will use direct reclaim to discard clean pages or slab pages + * that do not require the starting of any physical IO. + * + * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * + * GFP_USER is for userspace allocations that also need to be directly + * accessibly by the kernel or hardware. It is typically used by hardware + * for buffers that are mapped to userspace (e.g. graphics) that hardware + * still must DMA to. cpuset limits are enforced for these allocations. + * + * GFP_DMA exists for historical reasons and should be avoided where possible. + * The flags indicates that the caller requires that the lowest zone be + * used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but + * it would require careful auditing as some users really require it and + * others use the flag to avoid lowmem reserves in ZONE_DMA and treat the + * lowest zone as a type of emergency reserve. + * + * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit + * address. + * + * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, + * do not need to be directly accessible by the kernel but that cannot + * move once in use. An example may be a hardware allocation that maps + * data directly into userspace but has no addressing limitations. + * + * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not + * need direct access to but can use kmap() when access is required. They + * are expected to be movable via page reclaim or page migration. Typically, + * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE. + * + * GFP_TRANSHUGE is used for THP allocations. They are compound allocations + * that will fail quickly if memory is not available and will not wake + * kswapd on failure. 
*/ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) -#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_DMA __GFP_DMA +#define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ ~__GFP_KSWAPD_RECLAIM) -/* This mask makes up all the page movable related flags */ +/* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 -/* Control page allocator reclaim behavior */ -#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) - -/* Control slab gfp mask during early boot */ -#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) - -/* Control allocation constraints */ -#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) - -/* Do not use these with a slab allocator */ -#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) - -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some - platforms, used as appropriate on others */ - -#define GFP_DMA __GFP_DMA - -/* 4GB DMA on some platforms */ -#define GFP_DMA32 __GFP_DMA32 - -/* Convert GFP flags to their corresponding migrate type */ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); @@ -177,6 +266,8 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) /* Group based on mobility */ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } +#undef GFP_MOVABLE_MASK +#undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { diff --git a/mm/internal.h b/mm/internal.h index ff0f1ad..5b7841f6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,25 @@ #include #include +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. 
+ */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/shmem.c b/mm/shmem.c index 3b8b739..9187eee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt; #include #include +#include "internal.h" + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ee94dc..d045634 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -35,6 +35,8 @@ #include #include +#include "internal.h" + struct vfree_deferred { struct llist_head list; struct work_struct wq; -- cgit v0.10.2 From 89903327607232de32f05100cf03f9390b858e0b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Nov 2015 16:28:46 -0800 Subject: include/linux/mmzone.h: reflow comment Someone has an 86 column display. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d3bafe4..e23a9e7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -337,12 +337,13 @@ struct zone { unsigned long nr_reserved_highatomic; /* - * We don't know if the memory that we're going to allocate will be freeable - * or/and it will be released eventually, so to avoid totally wasting several - * GB of ram we must reserve some of the lower zone memory (otherwise we risk - * to run OOM on the lower zones despite there's tons of freeable ram - * on the higher zones). This array is recalculated at runtime if the - * sysctl_lowmem_reserve_ratio sysctl changes. + * We don't know if the memory that we're going to allocate will be + * freeable or/and it will be released eventually, so to avoid totally + * wasting several GB of ram we must reserve some of the lower zone + * memory (otherwise we risk to run OOM on the lower zones despite + * there being tons of freeable ram on the higher zones). This array is + * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl + * changes. */ long lowmem_reserve[MAX_NR_ZONES]; -- cgit v0.10.2 From c62d25556be6c965dc14288e796a576e8e39a7e9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 6 Nov 2015 16:28:49 -0800 Subject: mm, fs: introduce mapping_gfp_constraint() There are many places which use mapping_gfp_mask to restrict a more generic gfp mask which would be used for allocations which are not directly related to the page cache but they are performed in the same context. Let's introduce a helper function which makes the restriction explicit and easier to track. This patch doesn't introduce any functional changes. 
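For illustration only, here is a minimal sketch of the two call patterns this helper is intended to replace, based on the mapping_gfp_constraint() definition and the conversions in the diff below. The caller function and variable names are made up for the example; only the helper itself comes from the patch.

	#include <linux/pagemap.h>	/* mapping_gfp_mask(), mapping_gfp_constraint() */
	#include <linux/gfp.h>

	/* Hypothetical caller: choose a GFP mask for a page-cache allocation. */
	static gfp_t example_readpage_gfp(struct address_space *mapping)
	{
		/*
		 * Old open-coded style: start from the mapping's mask and strip
		 * __GFP_FS so the allocation cannot recurse into the filesystem.
		 */
		gfp_t old_style = mapping_gfp_mask(mapping) & ~__GFP_FS;

		/* New helper: same result, but the restriction is explicit. */
		gfp_t new_style = mapping_gfp_constraint(mapping, ~__GFP_FS);

		/*
		 * The other common pattern converted by the patch: restrict a
		 * generic mask such as GFP_KERNEL to what the mapping allows,
		 * e.g. before add_to_page_cache_lru().
		 */
		gfp_t lru_gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);

		(void)old_style;
		(void)lru_gfp;
		return new_style;
	}

Both forms are equivalent to masking with mapping_gfp_mask() by hand; the helper only makes the restriction easier to spot and grep for.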
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 3c2d4ab..1d47d2e 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -491,7 +491,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) * __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping) * so shmem can relocate pages during swapin if required. */ - BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) && + BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) && (page_to_pfn(p) >= 0x00100000UL)); } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7e505d4..399aab2 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2214,9 +2214,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) * Fail silently without starting the shrinker */ mapping = file_inode(obj->base.filp)->i_mapping; - gfp = mapping_gfp_mask(mapping); + gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM)); gfp |= __GFP_NORETRY | __GFP_NOWARN; - gfp &= ~(__GFP_IO | __GFP_RECLAIM); sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 57ee8ca..36dfeff 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) break; - if (add_to_page_cache_lru(page, mapping, pg_index, - GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { page_cache_release(page); goto next; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 938efe3..eb90f0f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3316,7 +3316,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { - return mapping_gfp_mask(mapping) & ~__GFP_FS; + return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* extent-tree.c */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abe3a66..ed05da1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & - ~(__GFP_FS | __GFP_HIGHMEM)); + mapping_gfp_constraint(inode->i_mapping, + ~(__GFP_FS | __GFP_HIGHMEM))); return inode; } diff --git a/fs/buffer.c b/fs/buffer.c index 82283ab..51aff02 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -999,7 +999,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; /* * XXX: __getblk_slow() can not really deal with failure and diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9d23e78..b7d218a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int ret1; struct address_space *mapping = inode->i_mapping; struct page *page = find_or_create_page(mapping, 0, - 
mapping_gfp_mask(mapping) & - ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out; @@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, if (i_size_read(inode) == 0) return; page = find_or_create_page(mapping, 0, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) return; if (PageUptodate(page)) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 47c5c97..0068e82 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); INIT_LIST_HEAD(tmplist); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 612fbcf..60aaecd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3344,7 +3344,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, int err = 0; page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return -ENOMEM; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 560af04..1061611 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) + mapping_gfp_constraint(mapping, GFP_KERNEL))) goto next_page; } diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 7f9b096..6de0fbf 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, filler_t *filler = super->s_devops->readpage; struct page *page; - BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS)); if (use_filler) page = read_cache_page(mapping, index, filler, sb); else { diff --git a/fs/mpage.c b/fs/mpage.c index 09abba7..1480d3a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; @@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping); + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; diff --git a/fs/namei.c b/fs/namei.c index 0d3340b..3c18970 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4604,7 +4604,7 @@ EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); + !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); } EXPORT_SYMBOL(page_symlink); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 4a73d6d..ac2f649 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode 
*dir, umode_t mode) goto failed; mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); @@ -522,7 +522,7 @@ static int __nilfs_read_inode(struct super_block *sb, up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); return 0; failed_unmap: diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 262561f..9d383e5 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, } } err = add_to_page_cache_lru(*cached_page, mapping, - index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(err)) { if (err == -EEXIST) continue; diff --git a/fs/splice.c b/fs/splice.c index 5fc1e50..801c21c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a6c78e0..26eabf5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -69,6 +69,13 @@ static inline gfp_t mapping_gfp_mask(struct address_space * mapping) return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; } +/* Restricts the given gfp_mask to what the mapping allows. */ +static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, + gfp_t gfp_mask) +{ + return mapping_gfp_mask(mapping) & gfp_mask; +} + /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... 
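/*
 * Illustration only (not part of this patch): with the new helper, callers
 * that used to open-code "gfp & mapping_gfp_mask(mapping)" can be written as
 * below. The function name here is hypothetical; the conversions the patch
 * actually performs follow in the remaining hunks.
 */
static int example_add_to_cache(struct address_space *mapping,
				struct page *page, pgoff_t index)
{
	/* restrict GFP_KERNEL to whatever the mapping permits */
	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);

	return add_to_page_cache_lru(page, mapping, index, gfp);
}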
diff --git a/mm/filemap.c b/mm/filemap.c index 6ef3674..1bb00762 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1722,7 +1722,7 @@ no_cached_page: goto out; } error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1824,7 +1824,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) return -ENOMEM; ret = add_to_page_cache_lru(page, mapping, offset, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) diff --git a/mm/readahead.c b/mm/readahead.c index 998ad59..ba22d7f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -90,7 +90,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, page = list_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -128,7 +128,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, struct page *page = list_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); -- cgit v0.10.2 From d6669d689f397137381fe6729293e0eba1ef09a8 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 6 Nov 2015 16:28:52 -0800 Subject: thp: remove unused vma parameter from khugepaged_alloc_page The "vma" parameter to khugepaged_alloc_page() is unused. It has to remain unused or the drop read lock 'map_sem' optimisation introduce by commit 8b1645685acf ("mm, THP: don't hold mmap_sem in khugepaged when allocating THP") wouldn't be safe. So let's remove it. Signed-off-by: Aaron Tomlin Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dabd247..73266ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2413,8 +2413,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { VM_BUG_ON_PAGE(*hpage, *hpage); @@ -2481,8 +2480,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); @@ -2530,7 +2528,7 @@ static void collapse_huge_page(struct mm_struct *mm, __GFP_THISNODE; /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); if (!new_page) return; -- cgit v0.10.2 From 23d0127096cb91cb6d354bdc71bd88a7bae3a1d5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 6 Nov 2015 16:28:55 -0800 Subject: fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE writeback sync_file_range(2) is documented to issue writeback only for pages that are not currently being written. 
After all, the system call was created so that userspace could issue background writeout, so waiting for in-flight IO is undesirable there. However, commit ee53a891f474 ("mm: do_sync_mapping_range integrity fix") switched do_sync_mapping_range() and thus sync_file_range() to issue writeback in WB_SYNC_ALL mode, since do_sync_mapping_range() was used by other code relying on WB_SYNC_ALL semantics. These days do_sync_mapping_range() is gone and we can switch sync_file_range(2) back to issuing WB_SYNC_NONE writeback. That should help PostgreSQL avoid large latency spikes when flushing data in the background. Andres measured a 20% increase in transactions per second on an SSD disk. Signed-off-by: Jan Kara Reported-by: Andres Freund Tested-by: Andres Freund Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/sync.c b/fs/sync.c index 4ec430a..dd5d171 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -348,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - ret = filemap_fdatawrite_range(mapping, offset, endbyte); + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); if (ret < 0) goto out_put; } -- cgit v0.10.2 From 6f6461562e8805784538fa469e9a538cd2b3d553 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Nov 2015 16:28:58 -0800 Subject: mm/memcontrol.c: uninline mem_cgroup_usage
gcc version 5.2.1 20151010 (Debian 5.2.1-22)
$ size mm/memcontrol.o mm/memcontrol.o.before
   text    data     bss     dec     hex  filename
  35535    7908      64   43507    a9f3  mm/memcontrol.o
  35762    7908      64   43734    aad6  mm/memcontrol.o.before
Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a547067..9acfb16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2801,7 +2801,7 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; -- cgit v0.10.2 From 1d5b43bfb60f7ba2b51792978a6b0781d4ebba93 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 6 Nov 2015 16:29:01 -0800 Subject: zram: introduce comp algorithm fallback functionality When the user supplies an unsupported compression algorithm, keep the previously selected one (known to be supported) or the default one (if the compression algorithm hasn't been changed yet). Note that previously this operation (i.e. setting an invalid algorithm) would result in no algorithm being selected, which means that this represents a small change in the default behaviour. Minchan said: For initializing zram, we need to set up 3 optional parameters in advance:
1. the number of compression streams
2. memory limitation
3. compression algorithm
Although the user may pass a completely wrong value for parameters 1 and 2, it's okay because they have default values, so zram will be initialized with the defaults (of course, when the user passes a wrong value via *echo*, sysfs returns -EINVAL so the user can notice it). But 3 is not consistent with the other optional parameters. IOW, if the user passes a wrong value for parameter 3, zram's initialization would fail, unlike with the other optional parameters. So this patch makes them consistent.
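To make the new behaviour concrete, here is a condensed sketch of the "validate first, keep the old selection on failure" pattern this patch gives comp_algorithm_store(); it is not the driver code itself (that follows in the diff below), and the function and buffer names are illustrative:

	static ssize_t example_algo_store(const char *buf, size_t len, char *cur_algo)
	{
		if (!zcomp_available_algorithm(buf))
			return -EINVAL;	/* reject before touching any state */

		/* only a supported name ever replaces the current selection */
		strlcpy(cur_algo, buf, CRYPTO_MAX_ALG_NAME);
		return len;
	}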
Signed-off-by: Luis Henriques Acked-by: Minchan Kim Acked-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9fa15bb..c93aeb8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -365,6 +365,9 @@ static ssize_t comp_algorithm_store(struct device *dev, struct zram *zram = dev_to_zram(dev); size_t sz; + if (!zcomp_available_algorithm(buf)) + return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { up_write(&zram->init_lock); @@ -378,9 +381,6 @@ static ssize_t comp_algorithm_store(struct device *dev, if (sz > 0 && zram->compressor[sz - 1] == '\n') zram->compressor[sz - 1] = 0x00; - if (!zcomp_available_algorithm(zram->compressor)) - len = -EINVAL; - up_write(&zram->init_lock); return len; } -- cgit v0.10.2 From 1237275580f3e2b998d355b2ff7f84c6b423aa11 Mon Sep 17 00:00:00 2001 From: Sergey SENOZHATSKY Date: Fri, 6 Nov 2015 16:29:04 -0800 Subject: zram: keep the exact overcommited value in mem_used_max `mem_used_max' is designed to store the max amount of memory zram consumed to store the data. However, it does not represent the actual 'overcommited' (max) value. The existing code goes to -ENOMEM overcommited case before it updates `->stats.max_used_pages', which hides the reason we went to -ENOMEM in the first place -- we actually used more memory than `->limit_pages': alloced_pages = zs_get_total_pages(meta->mem_pool); if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); ret = -ENOMEM; goto out; } update_used_max(zram, alloced_pages); Which is misleading. User will see -ENOMEM, check `->limit_pages', check `->stats.max_used_pages', which will keep the value BEFORE zram passed `->limit_pages', and see: `->stats.max_used_pages' < `->limit_pages' Move update_used_max() before we do `->limit_pages' check, so that user will see: `->stats.max_used_pages' > `->limit_pages' should the overcommit and -ENOMEM happen. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c93aeb8..3e8d8ff 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -726,14 +726,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } alloced_pages = zs_get_total_pages(meta->mem_pool); + update_used_max(zram, alloced_pages); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); ret = -ENOMEM; goto out; } - update_used_max(zram, alloced_pages); - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { -- cgit v0.10.2 From 1c53e0d2737f3ce4afa27d5703494eb14610ec26 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 6 Nov 2015 16:29:06 -0800 Subject: zram: make is_partial_io/valid_io_request/page_zero_filled return boolean Make is_partial_io()/valid_io_request()/page_zero_filled() return boolean, since each function only uses either one or zero as its return value. 
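As a tiny illustration of the conversion pattern in the diff below (a sketch, not code taken from the driver), a predicate that can only ever answer yes or no now says so in its type:

	static inline bool example_is_block_aligned(sector_t start)
	{
		/* previously returned int 0/1; now an explicit boolean */
		return !(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1));
	}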
Signed-off-by: Geliang Tang Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3e8d8ff..81a557c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -106,7 +106,7 @@ static void zram_set_obj_size(struct zram_meta *meta, meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } -static inline int is_partial_io(struct bio_vec *bvec) +static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } @@ -114,25 +114,25 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. */ -static inline int valid_io_request(struct zram *zram, +static inline bool valid_io_request(struct zram *zram, sector_t start, unsigned int size) { u64 end, bound; /* unaligned request */ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; + return false; if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; + return false; end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) - return 0; + return false; /* I/O request is valid */ - return 1; + return true; } static void update_position(u32 *index, int *offset, struct bio_vec *bvec) @@ -157,7 +157,7 @@ static inline void update_used_max(struct zram *zram, } while (old_max != cur_max); } -static int page_zero_filled(void *ptr) +static bool page_zero_filled(void *ptr) { unsigned int pos; unsigned long *page; @@ -166,10 +166,10 @@ static int page_zero_filled(void *ptr) for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { if (page[pos]) - return 0; + return false; } - return 1; + return true; } static void handle_zero_page(struct bio_vec *bvec) -- cgit v0.10.2 From b0c9865fd2d7a34aa58bb58756ff088d2ec4bbcd Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Fri, 6 Nov 2015 16:29:09 -0800 Subject: mm/zswap.c: remove unneeded initialization to NULL in zswap_entry_find_get() On the next line entry variable will be re-initialized so no need to init it with NULL. Signed-off-by: Alexey Klimov Cc: Seth Jennings Cc: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zswap.c b/mm/zswap.c index e54166d..8275689 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -342,7 +342,7 @@ static void zswap_entry_put(struct zswap_tree *tree, static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { - struct zswap_entry *entry = NULL; + struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) -- cgit v0.10.2 From 3d9c637f4ae74b45d95bb6cbd793fbffad0a709c Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:12 -0800 Subject: module: export param_free_charp() Change the param_free_charp() function from static to exported. It is used by zswap in the next patch ("zswap: use charp for zswap param strings"). 
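For context, a minimal sketch of how a module can now combine the charp helpers in a custom kernel_param_ops; the parameter name and set callback are made up, and zswap's real usage appears in the next patch:

	#include <linux/moduleparam.h>

	static char *example_name = "default";

	static int example_set(const char *val, const struct kernel_param *kp)
	{
		/* extra validation could go here */
		return param_set_charp(val, kp);
	}

	static const struct kernel_param_ops example_ops = {
		.set	= example_set,
		.get	= param_get_charp,
		.free	= param_free_charp,	/* now exported */
	};
	module_param_cb(example, &example_ops, &example_name, 0644);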
Signed-off-by: Dan Streetman Acked-by: Rusty Russell Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index c12f214..52666d9 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -386,6 +386,7 @@ extern int param_get_ullong(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_charp; extern int param_set_charp(const char *val, const struct kernel_param *kp); extern int param_get_charp(char *buffer, const struct kernel_param *kp); +extern void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ diff --git a/kernel/params.c b/kernel/params.c index b6554aa..93a380a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -325,10 +325,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, -- cgit v0.10.2 From c99b42c3529e5e1bff00f68250dc869f7de3bd5f Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:15 -0800 Subject: zswap: use charp for zswap param strings Instead of using a fixed-length string for the zswap params, use charp. This simplifies the code and uses less memory, as most zswap param strings will be less than the current maximum length. Signed-off-by: Dan Streetman Cc: Rusty Russell Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zswap.c b/mm/zswap.c index 8275689..025f8dc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT; -static struct kparam_string zswap_compressor_kparam = { - .string = zswap_compressor, - .maxlen = sizeof(zswap_compressor), -}; +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, - .get = param_get_string, + .get = param_get_charp, + .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, - &zswap_compressor_kparam, 0644); + &zswap_compressor, 0644); /* Compressed storage zpool to use */ #define ZSWAP_ZPOOL_DEFAULT "zbud" -static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT; -static struct kparam_string zswap_zpool_kparam = { - .string = zswap_zpool_type, - .maxlen = sizeof(zswap_zpool_type), -}; +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_string, + .set = zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, }; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644); +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; @@ -615,19 +609,29 @@ 
error: return NULL; } -static struct zswap_pool *__zswap_pool_create_fallback(void) +static __init struct zswap_pool *__zswap_pool_create_fallback(void) { if (!crypto_has_comp(zswap_compressor, 0, 0)) { + if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("default compressor %s not available\n", + zswap_compressor); + return NULL; + } pr_err("compressor %s not available, using default %s\n", zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); - strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT, - sizeof(zswap_compressor)); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; } if (!zpool_has_pool(zswap_zpool_type)) { + if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + return NULL; + } pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); - strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT, - sizeof(zswap_zpool_type)); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; } return zswap_pool_create(zswap_zpool_type, zswap_compressor); @@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool) * param callbacks **********************************/ +/* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; - char str[kp->str->maxlen], *s; + char *s = strstrip((char *)val); int ret; - /* - * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined - * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or - * 32 (arbitrary). - */ - strlcpy(str, val, kp->str->maxlen); - s = strim(str); + /* no change required */ + if (!strcmp(s, *(char **)kp->arg)) + return 0; /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ if (!zswap_init_started) - return param_set_copystring(s, kp); - - /* no change required */ - if (!strncmp(kp->str->string, s, kp->str->maxlen)) - return 0; + return param_set_charp(s, kp); if (!type) { - type = s; - if (!zpool_has_pool(type)) { - pr_err("zpool %s not available\n", type); + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); return -ENOENT; } + type = s; } else if (!compressor) { - compressor = s; - if (!crypto_has_comp(compressor, 0, 0)) { - pr_err("compressor %s not available\n", compressor); + if (!crypto_has_comp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); return -ENOENT; } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&zswap_pools_lock); @@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } if (pool) - ret = param_set_copystring(s, kp); + ret = param_set_charp(s, kp); else ret = -EINVAL; -- cgit v0.10.2 From 69e18f4dbedfbf208452e9da9979c92da30d2442 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:18 -0800 Subject: zpool: remove redundant zpool->type string, const-ify zpool_get_type Make the return type of zpool_get_type const; the string belongs to the zpool driver and should not be modified. Remove the redundant type field in the struct zpool; it is private to zpool.c and isn't needed since ->driver->type can be used directly. Add comments indicating strings must be null-terminated. 
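A short usage sketch against the API as it stands after this patch; the pool type, pool name, and the absence of evict ops are illustrative choices rather than an in-tree caller:

	#include <linux/gfp.h>
	#include <linux/zpool.h>

	static char example_type[] = "zbud";		/* must be NUL-terminated */
	static char example_name[] = "example pool";

	static struct zpool *example_create(void)
	{
		if (!zpool_has_pool(example_type))
			return NULL;

		/* NULL ops: this sketch does not support eviction */
		return zpool_create_pool(example_type, example_name, GFP_KERNEL, NULL);
	}
	/* zpool_get_type() now hands back the driver-owned, const type string. */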
Signed-off-by: Dan Streetman Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 42f8ec9..1f405be 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -41,7 +41,7 @@ bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, const struct zpool_ops *ops); -char *zpool_get_type(struct zpool *pool); +const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); diff --git a/mm/zpool.c b/mm/zpool.c index 8f670d3..13f524d 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -18,8 +18,6 @@ #include struct zpool { - char *type; - struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; @@ -73,6 +71,7 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); +/* this assumes @type is null-terminated. */ static struct zpool_driver *zpool_get_driver(char *type) { struct zpool_driver *driver; @@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * not be loaded, and calling @zpool_create_pool() with the pool type will * fail. * + * The @type string must be null-terminated. + * * Returns: true if @type pool is available, false if not */ bool zpool_has_pool(char *type) @@ -145,6 +146,8 @@ EXPORT_SYMBOL(zpool_has_pool); * * Implementations must guarantee this to be thread-safe. * + * The @type and @name strings must be null-terminated. + * * Returns: New zpool on success, NULL on failure. */ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, @@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - zpool->type = driver->type; zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; @@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_debug("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->driver->type); spin_lock(&pools_lock); list_del(&zpool->list); @@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool) * * Returns: The type of zpool. */ -char *zpool_get_type(struct zpool *zpool) +const char *zpool_get_type(struct zpool *zpool) { - return zpool->type; + return zpool->driver->type; } /** -- cgit v0.10.2 From 6f3526d6db7cbe8b53e42d6bf0cad2072afcf3fe Mon Sep 17 00:00:00 2001 From: Sergey SENOZHATSKY Date: Fri, 6 Nov 2015 16:29:21 -0800 Subject: mm: zsmalloc: constify struct zs_pool name Constify `struct zs_pool' ->name. 
[akpm@inux-foundation.org: constify zpool_create_pool()'s `type' arg also] Signed-off-by: Sergey Senozhatsky Acked-by: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 1f405be..2e97b77 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -38,7 +38,7 @@ enum zpool_mapmode { bool zpool_has_pool(char *type); -struct zpool *zpool_create_pool(char *type, char *name, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops); const char *zpool_get_type(struct zpool *pool); @@ -83,7 +83,9 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops, + void *(*create)(const char *name, + gfp_t gfp, + const struct zpool_ops *ops, struct zpool *zpool); void (*destroy)(void *pool); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 6398dfa..34eb160 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,7 +41,7 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name, gfp_t flags); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size); diff --git a/mm/zbud.c b/mm/zbud.c index fa48bcdf..d8a181f 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(char *name, gfp_t gfp, +static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { diff --git a/mm/zpool.c b/mm/zpool.c index 13f524d..fd3ff71 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -72,7 +72,7 @@ int zpool_unregister_driver(struct zpool_driver *driver) EXPORT_SYMBOL(zpool_unregister_driver); /* this assumes @type is null-terminated. */ -static struct zpool_driver *zpool_get_driver(char *type) +static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; @@ -150,7 +150,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * Returns: New zpool on success, NULL on failure. 
*/ -struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops) { struct zpool_driver *driver; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f135b1b..8b8e0da 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -237,7 +237,7 @@ struct link_free { }; struct zs_pool { - char *name; + const char *name; struct size_class **size_class; struct kmem_cache *handle_cachep; @@ -311,7 +311,7 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, +static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -548,7 +548,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(char *name, struct zs_pool *pool) +static int zs_pool_stat_create(const char *name, struct zs_pool *pool) { struct dentry *entry; @@ -588,7 +588,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) { return 0; } @@ -1866,7 +1866,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name, gfp_t flags) { int i; struct zs_pool *pool; -- cgit v0.10.2 From 8f958c98f28d088a1ef3e021ab7aeb59a234b953 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Fri, 6 Nov 2015 16:29:23 -0800 Subject: zsmalloc: add comments for ->inuse to zspage [akpm@linux-foundation.org: fix grammar] Signed-off-by: Hui Zhu Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8b8e0da..4396b82 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -38,6 +38,7 @@ * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page -- cgit v0.10.2 From 12a7bfad58cd604616dd5205efa6dc2be6f299eb Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Fri, 6 Nov 2015 16:29:26 -0800 Subject: zsmalloc: fix obj_to_head use page_private(page) as value but not pointer In obj_malloc(): if (!class->huge) /* record handle in the header of allocated chunk */ link->handle = handle; else /* record handle in first_page->private */ set_page_private(first_page, handle); In the hugepage we save handle to private directly. But in obj_to_head(): if (class->huge) { VM_BUG_ON(!is_first_page(page)); return *(unsigned long *)page_private(page); } else return *(unsigned long *)obj; It is used as a pointer. The reason why there is no problem until now is huge-class page is born with ZS_FULL so it can't be migrated. However, we need this patch for future work: "VM-aware zsmalloced page migration" to reduce external fragmentation. 
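Put differently (illustration only, using the existing page helpers): for huge classes the handle is stored as the value of page->private, so reading it back must not add an extra dereference:

	/* obj_malloc() did: set_page_private(first_page, handle); */
	static unsigned long example_huge_handle(struct page *first_page)
	{
		return page_private(first_page);	/* the value itself, not a pointer to chase */
	}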
Signed-off-by: Hui Zhu Acked-by: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4396b82..1fe9928 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -825,7 +825,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, { if (class->huge) { VM_BUG_ON(!is_first_page(page)); - return *(unsigned long *)page_private(page); + return page_private(page); } else return *(unsigned long *)obj; } -- cgit v0.10.2 From 759b26b29885a8ef6101aa554d9990803f6ef792 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 6 Nov 2015 16:29:29 -0800 Subject: zsmalloc: use preempt.h for in_interrupt() A cosmetic change. Commit c60369f01125 ("staging: zsmalloc: prevent mappping in interrupt context") added in_interrupt() check to zs_map_object() and 'hardirq.h' include; but in_interrupt() macro is defined in 'preempt.h' not in 'hardirq.h', so include it instead. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1fe9928..c481106 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v0.10.2 From 2c35169572b84897b43e6f3e9667fd1904451f34 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 6 Nov 2015 16:29:32 -0800 Subject: zsmalloc: don't test shrinker_enabled in zs_shrinker_count() We don't let user to disable shrinker in zsmalloc (once it's been enabled), so no need to check ->shrinker_enabled in zs_shrinker_count(), at the moment at least. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c481106..fd0593e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1823,9 +1823,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - if (!pool->shrinker_enabled) - return 0; - for (i = zs_size_classes - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) -- cgit v0.10.2 From 6f0b22760b7d8317569252cc7c36cbed22ebe401 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Fri, 6 Nov 2015 16:29:35 -0800 Subject: mm/zsmalloc.c: remove useless line in obj_free() Signed-off-by: Hui Zhu Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fd0593e..a8ff24a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1429,8 +1429,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - enum fullness_group fullness; BUG_ON(!obj); @@ -1438,7 +1436,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); - get_zspage_mapping(first_page, &class_idx, &fullness); f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); vaddr = kmap_atomic(f_page); -- cgit v0.10.2 From 6fe5186f0c7c18a8beb6d96c21e2390df7a12375 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 6 Nov 2015 16:29:38 -0800 Subject: zsmalloc: reduce size_class memory usage Each `struct size_class' contains `struct zs_size_stat': an array of NR_ZS_STAT_TYPE `unsigned long'. 
For zsmalloc built with no CONFIG_ZSMALLOC_STAT this results in a waste of `2 * sizeof(unsigned long)' per-class. The patch removes unneeded `struct zs_size_stat' members by redefining NR_ZS_STAT_TYPE (max stat idx in array). Since both NR_ZS_STAT_TYPE and zs_stat_type are compile time constants, GCC can eliminate zs_stat_inc()/zs_stat_dec() calls that use zs_stat_type larger than NR_ZS_STAT_TYPE: CLASS_ALMOST_EMPTY and CLASS_ALMOST_FULL at the moment. ./scripts/bloat-o-meter mm/zsmalloc.o.old mm/zsmalloc.o.new add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-39 (-39) function old new delta fix_fullness_group 97 94 -3 insert_zspage 100 86 -14 remove_zspage 141 119 -22 To summarize: a) each class now uses less memory b) we avoid a number of dec/inc stats (a minor optimization, but still). The gain will increase once we introduce additional stats. A simple IO test. iozone -t 4 -R -r 32K -s 60M -I +Z patched base " Initial write " 4145599.06 4127509.75 " Rewrite " 4146225.94 4223618.50 " Read " 17157606.00 17211329.50 " Re-read " 17380428.00 17267650.50 " Reverse Read " 16742768.00 16162732.75 " Stride read " 16586245.75 16073934.25 " Random read " 16349587.50 15799401.75 " Mixed workload " 10344230.62 9775551.50 " Random write " 4277700.62 4260019.69 " Pwrite " 4302049.12 4313703.88 " Pread " 6164463.16 6126536.72 " Fwrite " 7131195.00 6952586.00 " Fread " 12682602.25 12619207.50 Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a8ff24a..2858b20 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -167,9 +167,14 @@ enum zs_stat_type { OBJ_USED, CLASS_ALMOST_FULL, CLASS_ALMOST_EMPTY, - NR_ZS_STAT_TYPE, }; +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -448,19 +453,23 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] += cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] -= cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - return class->stats.objs[type]; + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; } #ifdef CONFIG_ZSMALLOC_STAT -- cgit v0.10.2 From 474e4eeaf26b6c3298ca3ae9d0a705b0853efb2a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:40 -0800 Subject: mm: drop page->slab_page Since 8456a648cf44 ("slab: use struct page for slab management") nobody uses slab_page field in struct page. Let's drop it. Signed-off-by: Kirill A. Shutemov Acked-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Vlastimil Babka Reviewed-by: Andrea Arcangeli Cc: Joonsoo Kim Cc: Andi Kleen Cc: "Paul E. 
McKenney" Cc: Aneesh Kumar K.V Cc: Hugh Dickins Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0a85da2..c0ec46d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -131,7 +131,6 @@ struct page { #endif }; - struct slab *slab_page; /* slab fields */ struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ -- cgit v0.10.2 From bc4f610d5a884eedfeac3b08f2ac87df74a1e55f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:44 -0800 Subject: slab, slub: use page->rcu_head instead of page->lru plus cast We have properly typed page->rcu_head, no need to cast page->lru. Signed-off-by: Kirill A. Shutemov Reviewed-by: Andrea Arcangeli Acked-by: Christoph Lameter Cc: "Paul E. McKenney" Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: David Rientjes Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index a9ef77d..e0819fa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1889,21 +1889,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. - * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); - - } else { + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + call_rcu(&page->rcu_head, kmem_rcu_free); + else kmem_freepages(cachep, page); - } /* * From now on, we don't use freelist diff --git a/mm/slub.c b/mm/slub.c index 9769562..7cb4bf9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1507,10 +1507,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); -- cgit v0.10.2 From 32e7ba1ea1f8d1f0ea4983e768f8b566770a55b3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:47 -0800 Subject: zsmalloc: use page->private instead of page->first_page We are going to rework how compound_head() work. It will not use page->first_page as we have it now. The only other user of page->first_page beyond compound pages is zsmalloc. Let's use page->private instead of page->first_page here. It occupies the same storage space. Signed-off-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: Sergey Senozhatsky Reviewed-by: Andrea Arcangeli Cc: "Paul E. McKenney" Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Christoph Lameter Cc: David Rientjes Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2858b20..9f15bdd 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,7 +16,7 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. 
For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -26,8 +26,7 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page * If the page is first_page for huge object, it stores handle. * Look at size_class->huge. * page->freelist: points to the first free object in zspage. @@ -774,7 +773,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -959,7 +958,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. Also, we set PG_private to identify the first page @@ -984,7 +983,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) if (i == 1) set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ -- cgit v0.10.2 From f1e61557f0230d51a3df8d825f2c156e75563bff Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:50 -0800 Subject: mm: pack compound_dtor and compound_order into one word in struct page The patch halves space occupied by compound_dtor and compound_order in struct page. For compound_order, it's trivial long -> short conversion. For get_compound_page_dtor(), we now use hardcoded table for destructor lookup and store its index in the struct page instead of direct pointer to destructor. It shouldn't be a big trouble to maintain the table: we have only two destructor and NULL currently. This patch free up one word in tail pages for reuse. This is preparation for the next patch. Signed-off-by: Kirill A. Shutemov Reviewed-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Andrea Arcangeli Cc: "Paul E. McKenney" Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Christoph Lameter Cc: David Rientjes Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 906c46a..6581c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -568,18 +568,32 @@ int split_free_page(struct page *page); /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. - * These are _only_ valid on the head of a PG_compound page. + * These are _only_ valid on the head of a compound page. 
*/ +typedef void compound_page_dtor(struct page *); + +/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ +enum compound_dtor_id { + NULL_COMPOUND_DTOR, + COMPOUND_PAGE_DTOR, +#ifdef CONFIG_HUGETLB_PAGE + HUGETLB_PAGE_DTOR, +#endif + NR_COMPOUND_DTORS, +}; +extern compound_page_dtor * const compound_page_dtors[]; static inline void set_compound_page_dtor(struct page *page, - compound_page_dtor *dtor) + enum compound_dtor_id compound_dtor) { - page[1].compound_dtor = dtor; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); + page[1].compound_dtor = compound_dtor; } static inline compound_page_dtor *get_compound_page_dtor(struct page *page) { - return page[1].compound_dtor; + VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); + return compound_page_dtors[page[1].compound_dtor]; } static inline int compound_order(struct page *page) @@ -589,7 +603,7 @@ static inline int compound_order(struct page *page) return page[1].compound_order; } -static inline void set_compound_order(struct page *page, unsigned long order) +static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c0ec46d..e334ef7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,8 +28,6 @@ struct mem_cgroup; IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -typedef void compound_page_dtor(struct page *); - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -136,8 +134,8 @@ struct page { */ /* First tail page of compound page */ struct { - compound_page_dtor *compound_dtor; - unsigned long compound_order; + unsigned short int compound_dtor; + unsigned short int compound_order; }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74ef0c6..e90a290 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1146,7 +1146,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - set_compound_page_dtor(page, NULL); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); @@ -1242,7 +1242,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; @@ -1294,7 +1294,7 @@ int PageHuge(struct page *page) return 0; page = compound_head(page); - return get_compound_page_dtor(page) == free_huge_page; + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -1568,7 +1568,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, if (page) { INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8d560a..fae1bd6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -229,6 +229,15 @@ static char * const 
zone_names[MAX_NR_ZONES] = { #endif }; +static void free_compound_page(struct page *page); +compound_page_dtor * const compound_page_dtors[] = { + NULL, + free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE + free_huge_page, +#endif +}; + int min_free_kbytes = 1024; int user_min_free_kbytes = -1; @@ -458,7 +467,7 @@ void prep_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, free_compound_page); + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { -- cgit v0.10.2 From 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:54 -0800 Subject: mm: make compound_head() robust Hugh has pointed that compound_head() call can be unsafe in some context. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true The race is pure theoretical. I don't it's possible to trigger it in practice. But who knows. We can fix the race by changing how encode PageTail() and compound_head() within struct page to be able to update them in one shot. The patch introduces page->compound_head into third double word block in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the rest bits are pointer to head page if bit zero is set. The patch moves page->pmd_huge_pte out of word, just in case if an architecture defines pgtable_t into something what can have the bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store pointer struct hugetlb_cgroup. The patch switch it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in first tail page to store struct hugetlb_cgroup pointer. But that's out of scope of the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long list to be absolutely sure, but looks like nobody uses bit 0 of the word. page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future call_rcu_lazy() is not allowed as it makes use of the bit and we can get false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Cc: Vlastimil Babka Acked-by: Paul E. McKenney Cc: Aneesh Kumar K.V Cc: Andi Kleen Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock index 6dea4fd..62842a8 100644 --- a/Documentation/vm/split_page_table_lock +++ b/Documentation/vm/split_page_table_lock @@ -54,8 +54,8 @@ everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), which must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache and page->first_page for its pages. -These fields share storage with page->ptl. 
+allocation: slab uses page->slab_cache for its pages. +This field shares storage with page->ptl. PMD split lock only makes sense if you have more than two page table levels. diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index f3dfe0d..44c6764 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -169,7 +169,6 @@ CONFIG_FLATMEM_MANUAL=y # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y -CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7edd305..24154c2 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -32,7 +32,7 @@ static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - return (struct hugetlb_cgroup *)page[2].lru.next; + return (struct hugetlb_cgroup *)page[2].private; } static inline @@ -42,7 +42,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) return -1; - page[2].lru.next = (void *)h_cg; + page[2].private = (unsigned long)h_cg; return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 6581c21..9671b6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,46 +430,6 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } -static inline struct page *compound_head_by_tail(struct page *tail) -{ - struct page *head = tail->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(tail))) - return head; - return tail; -} - -/* - * Since either compound page could be dismantled asynchronously in THP - * or we access asynchronously arbitrary positioned struct page, there - * would be tail flag race. To handle this race, we should call - * smp_rmb() before checking tail flag. compound_head_by_tail() did it. - */ -static inline struct page *compound_head(struct page *page) -{ - if (unlikely(PageTail(page))) - return compound_head_by_tail(page); - return page; -} - -/* - * If we access compound page synchronously such as access to - * allocated page, there is no need to handle tail flag race, so we can - * check tail flag directly without any synchronization primitive. - */ -static inline struct page *compound_head_fast(struct page *page) -{ - if (unlikely(PageTail(page))) - return page->first_page; - return page; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -518,7 +478,7 @@ static inline void get_huge_page_tail(struct page *page) VM_BUG_ON_PAGE(!PageTail(page), page); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(page->first_page)) + if (compound_tail_refcounted(compound_head(page))) atomic_inc(&page->_mapcount); } @@ -541,13 +501,7 @@ static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - /* - * We don't need to worry about synchronization of tail flag - * when we call virt_to_head_page() since it is only called for - * already allocated page and this page won't be freed until - * this virt_to_head_page() is finished. 
So use _fast variant. - */ - return compound_head_fast(page); + return compound_head(page); } /* @@ -1586,8 +1540,7 @@ static inline bool ptlock_init(struct page *page) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: - * slab code uses page->slab_cache and page->first_page (for tail - * pages), which share storage with page->ptl. + * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); if (!ptlock_alloc(page)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e334ef7..bb91658 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -111,7 +111,13 @@ struct page { }; }; - /* Third double word block */ + /* + * Third double word block + * + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ union { struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! @@ -132,14 +138,23 @@ struct page { struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ - /* First tail page of compound page */ + /* Tail pages of compound page */ struct { + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ unsigned short int compound_dtor; unsigned short int compound_order; }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - pgtable_t pmd_huge_pte; /* protected by page->ptl */ + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; #endif }; @@ -160,7 +175,6 @@ struct page { #endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ - struct page *first_page; /* Compound tail pages */ }; #ifdef CONFIG_MEMCG diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a525e50..bb53c7b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,12 +86,7 @@ enum pageflags { PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_writeback, /* Page is under writeback */ -#ifdef CONFIG_PAGEFLAGS_EXTENDED PG_head, /* A head page */ - PG_tail, /* A tail page */ -#else - PG_compound, /* A compound page */ -#endif PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ @@ -398,85 +393,46 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -#ifdef CONFIG_PAGEFLAGS_EXTENDED -/* - * System with lots of page flags available. This allows separate - * flags for PageHead() and PageTail() checks of compound pages so that bit - * tests can be used in performance sensitive paths. PageCompound is - * generally not used in hot code paths except arch/powerpc/mm/init_64.c - * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages - * and avoid handling those in real mode. 
- */ __PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -__PAGEFLAG(Tail, tail) -static inline int PageCompound(struct page *page) -{ - return page->flags & ((1L << PG_head) | (1L << PG_tail)); - -} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline void ClearPageCompound(struct page *page) +static inline int PageTail(struct page *page) { - BUG_ON(!PageHead(page)); - ClearPageHead(page); + return READ_ONCE(page->compound_head) & 1; } -#endif - -#define PG_head_mask ((1L << PG_head)) -#else -/* - * Reduce page flag use as much as possible by overlapping - * compound page flags with the flags used for page cache pages. Possible - * because PageCompound is always set for compound pages and not for - * pages on the LRU and/or pagecache. - */ -TESTPAGEFLAG(Compound, compound) -__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound) - -/* - * PG_reclaim is used in combination with PG_compound to mark the - * head and tail of a compound page. This saves one page flag - * but makes it impossible to use compound pages for the page cache. - * The PG_reclaim bit would have to be used for reclaim or readahead - * if compound pages enter the page cache. - * - * PG_compound & PG_reclaim => Tail page - * PG_compound & ~PG_reclaim => Head page - */ -#define PG_head_mask ((1L << PG_compound)) -#define PG_head_tail_mask ((1L << PG_compound) | (1L << PG_reclaim)) - -static inline int PageHead(struct page *page) +static inline void set_compound_head(struct page *page, struct page *head) { - return ((page->flags & PG_head_tail_mask) == PG_head_mask); + WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } -static inline int PageTail(struct page *page) +static inline void clear_compound_head(struct page *page) { - return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask); + WRITE_ONCE(page->compound_head, 0); } -static inline void __SetPageTail(struct page *page) +static inline struct page *compound_head(struct page *page) { - page->flags |= PG_head_tail_mask; + unsigned long head = READ_ONCE(page->compound_head); + + if (unlikely(head & 1)) + return (struct page *) (head - 1); + return page; } -static inline void __ClearPageTail(struct page *page) +static inline int PageCompound(struct page *page) { - page->flags &= ~PG_head_tail_mask; -} + return PageHead(page) || PageTail(page); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { - BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); - clear_bit(PG_compound, &page->flags); + BUG_ON(!PageHead(page)); + ClearPageHead(page); } #endif -#endif /* !PAGEFLAGS_EXTENDED */ +#define PG_head_mask ((1L << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); diff --git a/mm/Kconfig b/mm/Kconfig index 0d9fdcd..97a4e06 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. 
-# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/debug.c b/mm/debug.c index e784110..668aa35 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_private, "private" }, {1UL << PG_private_2, "private_2" }, {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73266ee..e1ccc83f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1755,8 +1755,7 @@ static void __split_huge_page_refcount(struct page *page, (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* clear PageTail before overwriting first_page */ - smp_wmb(); + clear_compound_head(page_tail); if (page_is_young(page)) set_page_young(page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e90a290..4eb0f09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,9 +1001,8 @@ static void destroy_compound_gigantic_page(struct page *page, struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); @@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ __ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 33d59ab..d8fb10d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/mm/internal.h b/mm/internal.h index 5b7841f6..a7f5670 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,9 +80,9 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. 
*/ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); if (get_page_head) - atomic_inc(&page->first_page->_count); + atomic_inc(&compound_head(page)->_count); get_huge_page_tail(page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 16a0ec3..8424b64 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -800,12 +798,7 @@ static struct page_state { */ { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED { head, head, MF_MSG_HUGE, me_huge_page }, - { tail, tail, MF_MSG_HUGE, me_huge_page }, -#else - { compound, compound, MF_MSG_HUGE, me_huge_page }, -#endif { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fae1bd6..e361001 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -445,15 +445,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -473,10 +473,7 @@ void prep_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -854,17 +851,30 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. 
+ */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -931,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } diff --git a/mm/swap.c b/mm/swap.c index 983f692..39395fb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON_PAGE(page_head != page->first_page, page); + VM_BUG_ON_PAGE(page_head != compound_head(page), page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -262,7 +262,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head_by_tail(page); + page_head = compound_head(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else -- cgit v0.10.2 From d00181b96eb86c914cb327d1de974a1b71366e1b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:29:57 -0800 Subject: mm: use 'unsigned int' for page order Let's try to be consistent about data type of page order. [sfr@canb.auug.org.au: fix build (type of pageblock_order)] [hughd@google.com: some configs end up with MAX_ORDER and pageblock_order having different types] Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Andrea Arcangeli Cc: "Paul E. 
McKenney" Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Sergey Senozhatsky Signed-off-by: Stephen Rothwell Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 9671b6f..00bad77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -550,7 +550,7 @@ static inline compound_page_dtor *get_compound_page_dtor(struct page *page) return compound_page_dtors[page[1].compound_dtor]; } -static inline int compound_order(struct page *page) +static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) return 0; @@ -1810,7 +1810,8 @@ extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, + const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 2baeee1..e942558 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -44,7 +44,7 @@ enum pageblock_bits { #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Huge page sizes are variable */ -extern int pageblock_order; +extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4eb0f09..7ce07d6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -994,7 +994,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) static void destroy_compound_gigantic_page(struct page *page, - unsigned long order) + unsigned int order) { int i; int nr_pages = 1 << order; @@ -1009,7 +1009,7 @@ static void destroy_compound_gigantic_page(struct page *page, __ClearPageHead(page); } -static void free_gigantic_page(struct page *page, unsigned order) +static void free_gigantic_page(struct page *page, unsigned int order) { free_contig_range(page_to_pfn(page), 1 << order); } @@ -1053,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned order) +static struct page *alloc_gigantic_page(int nid, unsigned int order) { unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; @@ -1089,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order) } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned long order); +static void prep_compound_gigantic_page(struct page *page, unsigned int order); static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { @@ -1122,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h, static inline bool gigantic_page_supported(void) { return true; } #else static inline bool gigantic_page_supported(void) { return false; } -static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, - unsigned long order) { } + unsigned int order) { } static inline int alloc_fresh_gigantic_page(struct hstate *h, nodemask_t *nodes_allowed) { return 0; } #endif @@ -1250,7 +1250,7 @@ static void prep_new_huge_page(struct hstate 
*h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -1968,7 +1968,8 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, + unsigned int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -2679,7 +2680,7 @@ static int __init hugetlb_init(void) module_init(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) +void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; diff --git a/mm/internal.h b/mm/internal.h index a7f5670..38e24b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -177,7 +177,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif @@ -235,7 +235,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use page_order_unsafe() below. */ -static inline unsigned long page_order(struct page *page) +static inline unsigned int page_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e361001..208e4c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -181,7 +181,7 @@ bool pm_suspended_storage(void) #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -int pageblock_order __read_mostly; +unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order); @@ -462,7 +462,7 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -662,7 +662,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - int max_order = MAX_ORDER; + unsigned int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -675,7 +675,7 @@ static inline void __free_one_page(struct page *page, * pageblock. Without this, pageblock isolation * could cause incorrect freepage accounting. 
*/ - max_order = min(MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); } @@ -1471,7 +1471,7 @@ int move_freepages(struct zone *zone, int migratetype) { struct page *page; - unsigned long order; + unsigned int order; int pages_moved = 0; #ifndef CONFIG_HOLES_IN_ZONE @@ -1584,7 +1584,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type) { - int current_order = page_order(page); + unsigned int current_order = page_order(page); int pages; /* Take ownership for orders >= pageblock_order */ @@ -2637,7 +2637,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; @@ -2671,7 +2671,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); @@ -3449,7 +3449,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order) } } -static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); @@ -3499,7 +3500,7 @@ EXPORT_SYMBOL(alloc_pages_exact); */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { - unsigned order = get_order(size); + unsigned int order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; @@ -3800,7 +3801,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4149,7 +4151,7 @@ static void build_zonelists(pg_data_t *pgdat) nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - int order = current_zonelist_order; + unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -6678,7 +6680,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { unsigned long outer_start, outer_end; - int ret = 0, order; + unsigned int order; + int ret = 0; struct compact_control cc = { .nr_migratepages = 0, -- cgit v0.10.2 From 1965c8b7ac7dd147663faf77a66a693ac3ddcb85 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Nov 2015 16:30:00 -0800 Subject: mm: use 'unsigned int' for compound_dtor/compound_order on 64BIT On 64 bit system we have enough space in struct page to encode compound_dtor and compound_order with unsigned int. On x86-64 it leads to slightly smaller code size due usesage of plain MOV instead of MOVZX (zero-extended move) or similar effect. allyesconfig: text data bss dec hex filename 159520446 48146736 72196096 279863278 10ae5fee vmlinux.pre 159520382 48146736 72196096 279863214 10ae5fae vmlinux.post On other architectures without native support of 16-bit data types the Signed-off-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: "Paul E. McKenney" Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Christoph Lameter Cc: David Rientjes Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Sergey Senozhatsky Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb91658..f8d1492 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -143,8 +143,19 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ +#ifdef CONFIG_64BIT + /* + * On 64 bit system we have enough space in struct page + * to encode compound_dtor and compound_order with + * unsigned int. It can help compiler generate better or + * smaller code on some archtectures. + */ + unsigned int compound_dtor; + unsigned int compound_order; +#else unsigned short int compound_dtor; unsigned short int compound_order; +#endif }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS -- cgit v0.10.2 From 3a49f3d2a10dfb27411d321900197a3d6c52405b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 6 Nov 2015 16:30:03 -0800 Subject: fs/proc/array.c: set overflow flag in case of error For now in task_name() we ignore the return code of string_escape_str() call. This is not good if buffer suddenly becomes not big enough. Do the proper error handling there. Signed-off-by: Andy Shevchenko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/array.c b/fs/proc/array.c index eed2050..d73291f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,18 +91,18 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) { char *buf; + size_t size; char tcomm[sizeof(p->comm)]; + int ret; get_task_comm(tcomm, p); seq_puts(m, "Name:\t"); - buf = m->buf + m->count; - /* Ignore error for now */ - buf += string_escape_str(tcomm, buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + size = seq_get_buf(m, &buf); + ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + seq_commit(m, ret < size ? ret : -1); - m->count = buf - m->buf; seq_putc(m, '\n'); } -- cgit v0.10.2 From 54708d2858e79a2bdda10bf8a20c80eb96c20613 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:30:06 -0800 Subject: proc: actually make proc_fd_permission() thread-friendly The commit 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly") fixed the access to /proc/self/fd from sub-threads, but introduced another problem: a sub-thread can't access /proc//fd/ or /proc/thread-self/fd if generic_permission() fails. Change proc_fd_permission() to check same_thread_group(pid_task(), current). Fixes: 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly") Reported-by: "Jin, Yihua" Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6e5fcd0..3c2a915 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, */ int proc_fd_permission(struct inode *inode, int mask) { - int rv = generic_permission(inode, mask); + struct task_struct *p; + int rv; + + rv = generic_permission(inode, mask); if (rv == 0) - return 0; - if (task_tgid(current) == proc_pid(inode)) + return rv; + + rcu_read_lock(); + p = pid_task(proc_pid(inode), PIDTYPE_PID); + if (p && same_thread_group(p, current)) rv = 0; + rcu_read_unlock(); + return rv; } -- cgit v0.10.2 From 9add850c211a39d5ab1a091d48795e21599a73d0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Nov 2015 16:30:09 -0800 Subject: include/linux/compiler-gcc.h: improve __visible documentation Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0e3110a..22ab246 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -205,7 +205,10 @@ #if GCC_VERSION >= 40600 /* - * Tell the optimizer that something else uses this function or variable. + * When used with Link Time Optimization, gcc can optimize away C functions or + * variables which are referenced only from assembly code. __visible tells the + * optimizer that something else uses this function or variable, thus preventing + * this. */ #define __visible __attribute__((externally_visible)) #endif -- cgit v0.10.2 From eac44a5e07be41a153e52c35c4d7dc0fec23adb3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Nov 2015 16:30:12 -0800 Subject: fs/jffs2/wbuf.c: remove stray semicolon Reported-by: Wu Fengguang Cc: Sasha Levin Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 09ed551..955da62 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { if ((c->flash_size % c->sector_size) != 0) { c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; pr_warn("flash size adjusted to %dKiB\n", c->flash_size); - }; + } c->wbuf_ofs = 0xFFFFFFFF; c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); -- cgit v0.10.2 From 3e406b1d7c1e5c14c84a71eb4bee5f46ba690401 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:15 -0800 Subject: lib/dynamic_debug.c: use kstrdup_const Using kstrdup_const, thus reusing .rodata when possible, saves around 2 kB of runtime memory on my laptop/.config combination. 
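For illustration only, not part of the patch: kstrdup_const() returns its argument unchanged when the string already lives in .rodata and only falls back to kstrdup() otherwise, while kfree_const() skips the kfree() for .rodata pointers. A minimal sketch, assuming a caller that registers a literal name (the name "my_module" is made up for the example):

	const char *mod_name = kstrdup_const("my_module", GFP_KERNEL);

	if (!mod_name)
		return -ENOMEM;
	/* ... use mod_name ... */
	kfree_const(mod_name);	/* no-op here: "my_module" is in .rodata */

This is why duplicating literal module names stops costing runtime memory.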
Signed-off-by: Rasmus Villemoes Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index e491e02..e3952e9 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -42,7 +42,7 @@ extern struct _ddebug __stop___verbose[]; struct ddebug_table { struct list_head link; - char *mod_name; + const char *mod_name; unsigned int num_ddebugs; struct _ddebug *ddebugs; }; @@ -841,12 +841,12 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *name) { struct ddebug_table *dt; - char *new_name; + const char *new_name; dt = kzalloc(sizeof(*dt), GFP_KERNEL); if (dt == NULL) return -ENOMEM; - new_name = kstrdup(name, GFP_KERNEL); + new_name = kstrdup_const(name, GFP_KERNEL); if (new_name == NULL) { kfree(dt); return -ENOMEM; @@ -907,7 +907,7 @@ int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *module) static void ddebug_table_free(struct ddebug_table *dt) { list_del_init(&dt->link); - kfree(dt->mod_name); + kfree_const(dt->mod_name); kfree(dt); } -- cgit v0.10.2 From 5e4ee7b13b522d07196e737f399843c58569604d Mon Sep 17 00:00:00 2001 From: Martin Kletzander Date: Fri, 6 Nov 2015 16:30:17 -0800 Subject: printk: synchronize %p formatting documentation Move all pointer-formatting documentation to one place in the code and one place in the documentation instead of keeping it in three places with different level of completeness. Documentation/printk-formats.txt has detailed information about each modifier, docstring above pointer() has short descriptions of them (as that is the function dealing with %p) and docstring above vsprintf() is removed as redundant. Both docstrings in the code that were modified are updated with a reminder of updating the documentation upon any further change. [akpm@linux-foundation.org: fix comment] Signed-off-by: Martin Kletzander Reviewed-by: Andy Shevchenko Cc: Rasmus Villemoes Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 2216eb1..9b8d7f7 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -119,6 +119,7 @@ Raw buffer as an escaped string: If field width is omitted the 1 byte only will be escaped. Raw buffer as a hex string: + %*ph 00 01 02 ... 3f %*phC 00:01:02: ... :3f %*phD 00-01-02- ... -3f @@ -234,6 +235,7 @@ UUID/GUID addresses: Passed by reference. dentry names: + %pd{,2,3,4} %pD{,2,3,4} @@ -256,6 +258,8 @@ struct va_format: va_list *va; }; + Implements a "recursive vsnprintf". + Do not use this feature without some mechanism to verify the correctness of the format string and va_list arguments. @@ -284,6 +288,31 @@ bitmap and its derivatives such as cpumask and nodemask: Passed by reference. +Network device features: + + %pNF 0x000000000000c000 + + For printing netdev_features_t. + + Passed by reference. + +Command from struct task_struct + + %pT ls + + For printing executable name excluding path from struct + task_struct. + + Passed by reference. + +Ignored argument: + + %n %n + + The argument passed will be ignored. In other words, literal "%n" will + be in the output and the argument will be considered for next format + specifier. + Thank you for your cooperation and attention. 
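For illustration only, not part of the patch: a few of the specifiers documented above as they would look at a call site. Here dev, tsk and buf are assumed to be a valid struct net_device *, a struct task_struct * and a small byte array:

	pr_info("features: %pNF\n", &dev->features);	    /* netdev_features_t, passed by reference */
	pr_info("command:  %pT\n", tsk);		    /* executable name from struct task_struct */
	pr_info("payload:  %*ph\n", (int)sizeof(buf), buf); /* variable-length hex dump */
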
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 95cd63b..e966a45 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1448,6 +1448,9 @@ int kptr_restrict __read_mostly; * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock + * - 'n' For ignored argument + * + * ** Please update also Documentation/printk-formats.txt when making changes ** * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1812,40 +1815,13 @@ qualifier: * @args: Arguments for the format string * * This function follows C99 vsnprintf, but has some extensions: - * %pS output the name of a text symbol with offset - * %ps output the name of a text symbol without offset - * %pF output the name of a function pointer with its offset - * %pf output the name of a function pointer without its offset - * %pB output the name of a backtrace symbol with its offset - * %pR output the address range in a struct resource with decoded flags - * %pr output the address range in a struct resource with raw flags - * %pb output the bitmap with field width as the number of bits - * %pbl output the bitmap as range list with field width as the number of bits - * %pM output a 6-byte MAC address with colons - * %pMR output a 6-byte MAC address with colons in reversed order - * %pMF output a 6-byte MAC address with dashes - * %pm output a 6-byte MAC address without colons - * %pmR output a 6-byte MAC address without colons in reversed order - * %pI4 print an IPv4 address without leading zeros - * %pi4 print an IPv4 address with leading zeros - * %pI6 print an IPv6 address with colons - * %pi6 print an IPv6 address without colons - * %pI6c print an IPv6 address as specified by RFC 5952 - * %pIS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address - * %piS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address - * %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper - * case. - * %*pE[achnops] print an escaped buffer - * %*ph[CDN] a variable-length hex string with a separator (supports up to 64 - * bytes of the input) - * %pC output the name (Common Clock Framework) or address (legacy clock - * framework) of a clock - * %pCn output the name (Common Clock Framework) or address (legacy clock - * framework) of a clock - * %pCr output the current rate of a clock * %n is ignored + * %p* is handled by pointer() + * + * See pointer() or Documentation/printk-formats.txt for more + * extensive description. * - * ** Please update Documentation/printk-formats.txt when making changes ** + * ** Please update the documentation in both places when making changes ** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing -- cgit v0.10.2 From b006f19b055f90b73e97086490f95b83095dcc91 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:20 -0800 Subject: lib/vsprintf.c: handle invalid format specifiers more robustly If we meet any invalid or unsupported format specifier, 'handling' it by just printing it as a literal string is not safe: Presumably the format string and the arguments passed gcc's type checking, but that means something like sprintf(buf, "%n %pd", &intvar, dentry) would end up interpreting &intvar as a struct dentry*. 
When the offending specifier was %n it used to be at the end of the format string, but we can't rely on that always being the case. Also, gcc doesn't complain about some more or less exotic qualifiers (or 'length modifiers' in posix-speak) such as 'j' or 'q', but being unrecognized by the kernel's printf implementation, they'd be interpreted as unknown specifiers, and the rest of arguments would be interpreted wrongly. So let's complain about anything we don't understand, not just %n, and stop pretending that we'd be able to make sense of the rest of the format/arguments. If the offending specifier is in a printk() call we unfortunately only get a "BUG: recent printk recursion!", but at least direct users of the sprintf family will be caught. Signed-off-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Acked-by: Kees Cook Cc: Martin Kletzander Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e966a45..e35724c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1772,14 +1772,14 @@ qualifier: case 'n': /* - * Since %n poses a greater security risk than utility, treat - * it as an invalid format specifier. Warn about its use so - * that new instances don't get added. + * Since %n poses a greater security risk than + * utility, treat it as any other invalid or + * unsupported format specifier. */ - WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", fmt); /* Fall-through */ default: + WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt); spec->type = FORMAT_TYPE_INVALID; return fmt - start; } @@ -1920,10 +1920,15 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_INVALID: - if (str < end) - *str = '%'; - ++str; - break; + /* + * Presumably the arguments passed gcc's type + * checking, but there is no safe or sane way + * for us to continue parsing the format and + * fetching from the va_list; the remaining + * specifiers and arguments would be out of + * sync. + */ + goto out; default: switch (spec.type) { @@ -1968,6 +1973,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) } } +out: if (size > 0) { if (str < end) *str = '\0'; @@ -2165,9 +2171,10 @@ do { \ switch (spec.type) { case FORMAT_TYPE_NONE: - case FORMAT_TYPE_INVALID: case FORMAT_TYPE_PERCENT_CHAR: break; + case FORMAT_TYPE_INVALID: + goto out; case FORMAT_TYPE_WIDTH: case FORMAT_TYPE_PRECISION: @@ -2229,6 +2236,7 @@ do { \ } } +out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } @@ -2351,12 +2359,14 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) break; case FORMAT_TYPE_PERCENT_CHAR: - case FORMAT_TYPE_INVALID: if (str < end) *str = '%'; ++str; break; + case FORMAT_TYPE_INVALID: + goto out; + default: { unsigned long long num; @@ -2399,6 +2409,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) } /* switch(spec.type) */ } /* while(*fmt) */ +out: if (size > 0) { if (str < end) *str = '\0'; -- cgit v0.10.2 From 762abb515415a5a4a37423f4f4ff5770d5a14bac Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:23 -0800 Subject: lib/vsprintf.c: also improve sanity check in bstr_printf() Quoting from 2aa2f9e21e4e ("lib/vsprintf.c: improve sanity check in vsnprintf()"): On 64 bit, size may very well be huge even if bit 31 happens to be 0. Somehow it doesn't feel right that one can pass a 5 GiB buffer but not a 3 GiB one. 
So cap at INT_MAX as was probably the intention all along. This is also the made-up value passed by sprintf and vsprintf. I should have seen this copy-pasted instance back then, but let's just do it now. Signed-off-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Acked-by: Kees Cook Cc: Martin Kletzander Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e35724c..a513469 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2270,7 +2270,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) char *str, *end; const char *args = (const char *)bin_buf; - if (WARN_ON_ONCE((int) size < 0)) + if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; -- cgit v0.10.2 From 80c9eb46fa7236c1236ec695bfa2403c10cb8645 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:26 -0800 Subject: lib/vsprintf.c: remove SPECIAL handling in pointer() As a quick git grep -E '%[ +0#-]*#[ +0#-]*(\*|[0-9]+)?(\.(\*|[0-9]+)?)?p' shows, nobody uses the # flag with %p. Should one try to do so, one will be met with warning: `#' flag used with `%p' gnu_printf format [-Wformat] (POSIX and C99 both say "... For other conversion specifiers, the behavior is undefined.". Obviously, the kernel can choose to define the behaviour however it wants, but as long as gcc issues that warning, users are unlikely to show up.) Since default_width is effectively always 2*sizeof(void*), we can simplify the prologue of pointer() and save a few instructions. Signed-off-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Acked-by: Kees Cook Cc: Martin Kletzander Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/vsprintf.c b/lib/vsprintf.c index a513469..7848d53 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1460,7 +1460,7 @@ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { - int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0); + const int default_width = 2 * sizeof(void *); if (!ptr && *fmt != 'K') { /* -- cgit v0.10.2 From 707cc7280f452a162c52bc240eae62568b9753c2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:29 -0800 Subject: test_printf: test printf family at runtime This adds a simple module for testing the kernel's printf facilities. Previously, some %p extensions have caused a wrong return value in case the entire output didn't fit and/or been unusable in kasprintf(). This should help catch such issues. Also, it should help ensure that changes to the formatting algorithms don't break anything. I'm not sure if we have a struct dentry or struct file lying around at boot time or if we can fake one, but most %p extensions should be testable, as should the ordinary number and string formatting. The nature of vararg functions means we can't use a more conventional table-driven approach. For now, this is mostly a skeleton; contributions are very welcome. Some tests are/will be slightly annoying to write, since the expected output depends on stuff like CONFIG_*, sizeof(long), runtime values etc. 
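A hypothetical sketch of what a contributed case could look like, using the test() macro introduced below; the expected string is hand-computed for this example and is an assumption, not part of the patch:

	static void __init
	more_numbers(void)
	{
		/* 127 as zero-padded octal, alternate-form hex and plain unsigned */
		test("00177|0x7f|127", "%05o|%#x|%u", 127, 127, 127);
	}

Such a helper would then be wired into test_printf_init() alongside the existing groups.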
Signed-off-by: Rasmus Villemoes Reviewed-by: Kees Cook Cc: Andy Shevchenko Cc: Martin Kletzander Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1d1521c..16bf3bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1686,6 +1686,9 @@ config TEST_STRING_HELPERS config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" +config TEST_PRINTF + tristate "Test printf() family of functions at runtime" + config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" default n diff --git a/lib/Makefile b/lib/Makefile index 8de3b01..7f1de26 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o +obj-$(CONFIG_TEST_PRINTF) += test_printf.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/test_printf.c b/lib/test_printf.c new file mode 100644 index 0000000..c5a666a --- /dev/null +++ b/lib/test_printf.c @@ -0,0 +1,362 @@ +/* + * Test cases for printf facility. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define BUF_SIZE 256 +#define FILL_CHAR '$' + +#define PTR1 ((void*)0x01234567) +#define PTR2 ((void*)(long)(int)0xfedcba98) + +#if BITS_PER_LONG == 64 +#define PTR1_ZEROES "000000000" +#define PTR1_SPACES " " +#define PTR1_STR "1234567" +#define PTR2_STR "fffffffffedcba98" +#define PTR_WIDTH 16 +#else +#define PTR1_ZEROES "0" +#define PTR1_SPACES " " +#define PTR1_STR "1234567" +#define PTR2_STR "fedcba98" +#define PTR_WIDTH 8 +#endif +#define PTR_WIDTH_STR stringify(PTR_WIDTH) + +static unsigned total_tests __initdata; +static unsigned failed_tests __initdata; +static char *test_buffer __initdata; + +static int __printf(4, 0) __init +do_test(int bufsize, const char *expect, int elen, + const char *fmt, va_list ap) +{ + va_list aq; + int ret, written; + + total_tests++; + + memset(test_buffer, FILL_CHAR, BUF_SIZE); + va_copy(aq, ap); + ret = vsnprintf(test_buffer, bufsize, fmt, aq); + va_end(aq); + + if (ret != elen) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n", + bufsize, fmt, ret, elen); + return 1; + } + + if (!bufsize) { + if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) { + pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", + fmt); + return 1; + } + return 0; + } + + written = min(bufsize-1, elen); + if (test_buffer[written]) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) did not nul-terminate buffer\n", + bufsize, fmt); + return 1; + } + + if (memcmp(test_buffer, expect, written)) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", + bufsize, fmt, test_buffer, written, expect); + return 1; + } + return 0; +} + +static void __printf(3, 4) __init +__test(const char *expect, int elen, const char *fmt, ...) +{ + va_list ap; + int rand; + char *p; + + BUG_ON(elen >= BUF_SIZE); + + va_start(ap, fmt); + + /* + * Every fmt+args is subjected to four tests: Three where we + * tell vsnprintf varying buffer sizes (plenty, not quite + * enough and 0), and then we also test that kvasprintf would + * be able to print it as expected. 
+ */ + failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap); + rand = 1 + prandom_u32_max(elen+1); + /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */ + failed_tests += do_test(rand, expect, elen, fmt, ap); + failed_tests += do_test(0, expect, elen, fmt, ap); + + p = kvasprintf(GFP_KERNEL, fmt, ap); + if (p) { + if (memcmp(p, expect, elen+1)) { + pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", + fmt, p, expect); + failed_tests++; + } + kfree(p); + } + va_end(ap); +} + +#define test(expect, fmt, ...) \ + __test(expect, strlen(expect), fmt, ##__VA_ARGS__) + +static void __init +test_basic(void) +{ + /* Work around annoying "warning: zero-length gnu_printf format string". */ + char nul = '\0'; + + test("", &nul); + test("100%", "100%%"); + test("xxx%yyy", "xxx%cyyy", '%'); + __test("xxx\0yyy", 7, "xxx%cyyy", '\0'); +} + +static void __init +test_number(void) +{ + test("0x1234abcd ", "%#-12x", 0x1234abcd); + test(" 0x1234abcd", "%#12x", 0x1234abcd); + test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); +} + +static void __init +test_string(void) +{ + test("", "%s%.0s", "", "123"); + test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); + test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); + /* + * POSIX and C99 say that a missing precision should be + * treated as a precision of 0. However, the kernel's printf + * implementation treats this case as if the . wasn't + * present. Let's add a test case documenting the current + * behaviour; should anyone ever feel the need to follow the + * standards more closely, this can be revisited. + */ + test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); + test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); +} + +static void __init +plain(void) +{ + test(PTR1_ZEROES PTR1_STR " " PTR2_STR, "%p %p", PTR1, PTR2); + /* + * The field width is overloaded for some %p extensions to + * pass another piece of information. For plain pointers, the + * behaviour is slightly odd: One cannot pass either the 0 + * flag nor a precision to %p without gcc complaining, and if + * one explicitly gives a field width, the number is no longer + * zero-padded. + */ + test("|" PTR1_STR PTR1_SPACES " | " PTR1_SPACES PTR1_STR "|", + "|%-*p|%*p|", PTR_WIDTH+2, PTR1, PTR_WIDTH+2, PTR1); + test("|" PTR2_STR " | " PTR2_STR "|", + "|%-*p|%*p|", PTR_WIDTH+2, PTR2, PTR_WIDTH+2, PTR2); + + /* + * Unrecognized %p extensions are treated as plain %p, but the + * alphanumeric suffix is ignored (that is, does not occur in + * the output.) 
+ */ + test("|"PTR1_ZEROES PTR1_STR"|", "|%p0y|", PTR1); + test("|"PTR2_STR"|", "|%p0y|", PTR2); +} + +static void __init +symbol_ptr(void) +{ +} + +static void __init +kernel_ptr(void) +{ +} + +static void __init +struct_resource(void) +{ +} + +static void __init +addr(void) +{ +} + +static void __init +escaped_str(void) +{ +} + +static void __init +hex_string(void) +{ + const char buf[3] = {0xc0, 0xff, 0xee}; + + test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee", + "%3ph|%3phC|%3phD|%3phN", buf, buf, buf, buf); + test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee", + "%*ph|%*phC|%*phD|%*phN", 3, buf, 3, buf, 3, buf, 3, buf); +} + +static void __init +mac(void) +{ + const u8 addr[6] = {0x2d, 0x48, 0xd6, 0xfc, 0x7a, 0x05}; + + test("2d:48:d6:fc:7a:05", "%pM", addr); + test("05:7a:fc:d6:48:2d", "%pMR", addr); + test("2d-48-d6-fc-7a-05", "%pMF", addr); + test("2d48d6fc7a05", "%pm", addr); + test("057afcd6482d", "%pmR", addr); +} + +static void __init +ip4(void) +{ + struct sockaddr_in sa; + + sa.sin_family = AF_INET; + sa.sin_port = cpu_to_be16(12345); + sa.sin_addr.s_addr = cpu_to_be32(0x7f000001); + + test("127.000.000.001|127.0.0.1", "%pi4|%pI4", &sa.sin_addr, &sa.sin_addr); + test("127.000.000.001|127.0.0.1", "%piS|%pIS", &sa, &sa); + sa.sin_addr.s_addr = cpu_to_be32(0x01020304); + test("001.002.003.004:12345|1.2.3.4:12345", "%piSp|%pISp", &sa, &sa); +} + +static void __init +ip6(void) +{ +} + +static void __init +ip(void) +{ + ip4(); + ip6(); +} + +static void __init +uuid(void) +{ + const char uuid[16] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}; + + test("00010203-0405-0607-0809-0a0b0c0d0e0f", "%pUb", uuid); + test("00010203-0405-0607-0809-0A0B0C0D0E0F", "%pUB", uuid); + test("03020100-0504-0706-0809-0a0b0c0d0e0f", "%pUl", uuid); + test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); +} + +static void __init +dentry(void) +{ +} + +static void __init +struct_va_format(void) +{ +} + +static void __init +struct_clk(void) +{ +} + +static void __init +bitmap(void) +{ + DECLARE_BITMAP(bits, 20); + const int primes[] = {2,3,5,7,11,13,17,19}; + int i; + + bitmap_zero(bits, 20); + test("00000|00000", "%20pb|%*pb", bits, 20, bits); + test("|", "%20pbl|%*pbl", bits, 20, bits); + + for (i = 0; i < ARRAY_SIZE(primes); ++i) + set_bit(primes[i], bits); + test("a28ac|a28ac", "%20pb|%*pb", bits, 20, bits); + test("2-3,5,7,11,13,17,19|2-3,5,7,11,13,17,19", "%20pbl|%*pbl", bits, 20, bits); + + bitmap_fill(bits, 20); + test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); + test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); +} + +static void __init +netdev_features(void) +{ +} + +static void __init +test_pointer(void) +{ + plain(); + symbol_ptr(); + kernel_ptr(); + struct_resource(); + addr(); + escaped_str(); + hex_string(); + mac(); + ip(); + uuid(); + dentry(); + struct_va_format(); + struct_clk(); + bitmap(); + netdev_features(); +} + +static int __init +test_printf_init(void) +{ + test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!test_buffer) + return -ENOMEM; + + test_basic(); + test_number(); + test_string(); + test_pointer(); + + kfree(test_buffer); + + if (failed_tests == 0) + pr_info("all %u tests passed\n", total_tests); + else + pr_warn("failed %u out of %u tests\n", failed_tests, total_tests); + + return failed_tests ? 
-EINVAL : 0; +} + +module_init(test_printf_init); + +MODULE_AUTHOR("Rasmus Villemoes "); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 317dc34ab786d4f22bf7ed8ab546b9ae57f900d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Nov 2015 16:30:32 -0800 Subject: selftests: run lib/test_printf module This runs the lib/test_printf module to make sure printf is operating sanely. Signed-off-by: Kees Cook Cc: Rasmus Villemoes Cc: Shuah Khan Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index cfe1213..4b4957b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -6,6 +6,7 @@ TARGETS += firmware TARGETS += ftrace TARGETS += futex TARGETS += kcmp +TARGETS += lib TARGETS += membarrier TARGETS += memfd TARGETS += memory-hotplug diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile new file mode 100644 index 0000000..47147b9 --- /dev/null +++ b/tools/testing/selftests/lib/Makefile @@ -0,0 +1,8 @@ +# Makefile for lib/ function selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := printf.sh + +include ../lib.mk diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh new file mode 100644 index 0000000..4fdc70f --- /dev/null +++ b/tools/testing/selftests/lib/printf.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Runs printf infrastructure using test_printf kernel module + +if /sbin/modprobe -q test_printf; then + /sbin/modprobe -q -r test_printf + echo "printf: ok" +else + echo "printf: [FAIL]" + exit 1 +fi -- cgit v0.10.2 From d7ec9a05d6defda8432da574a2a888eed6fc29f6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:30:35 -0800 Subject: lib/vsprintf.c: update documentation %n is no longer just ignored; it results in early return from vsnprintf. Also add a request to add test cases for future %p extensions. Signed-off-by: Rasmus Villemoes Reviewed-by: Martin Kletzander Reviewed-by: Andy Shevchenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 9b8d7f7..b784c27 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -23,6 +23,10 @@ Example: Reminder: sizeof() result is of type size_t. +The kernel's printf does not support %n. For obvious reasons, floating +point formats (%e, %f, %g, %a) are also not recognized. Use of any +unsupported specifier or length qualifier results in a WARN and early +return from vsnprintf. Raw pointer value SHOULD be printed with %p. The kernel supports the following extended format specifiers for pointer types: @@ -305,13 +309,9 @@ Command from struct task_struct Passed by reference. -Ignored argument: +If you add other %p extensions, please extend lib/test_printf.c with +one or more test cases, if at all feasible. - %n %n - - The argument passed will be ignored. In other words, literal "%n" will - be in the output and the argument will be considered for next format - specifier. Thank you for your cooperation and attention. 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7848d53..f9cee8e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1448,7 +1448,6 @@ int kptr_restrict __read_mostly; * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock - * - 'n' For ignored argument * * ** Please update also Documentation/printk-formats.txt when making changes ** * @@ -1814,8 +1813,10 @@ qualifier: * @fmt: The format string to use * @args: Arguments for the format string * - * This function follows C99 vsnprintf, but has some extensions: - * %n is ignored + * This function generally follows C99 vsnprintf, but has some + * extensions and a few limitations: + * + * %n is unsupported * %p* is handled by pointer() * * See pointer() or Documentation/printk-formats.txt for more -- cgit v0.10.2 From 3824657c522f19f85a76bd932821174a5557a382 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 6 Nov 2015 16:30:38 -0800 Subject: printk: prevent userland from spoofing kernel messages The following statement of ABI/testing/dev-kmsg is not quite right: It is not possible to inject messages from userspace with the facility number LOG_KERN (0), to make sure that the origin of the messages can always be reliably determined. Userland actually can inject messages with a facility of 0 by abusing the fact that the facility is stored in a u8 data type. By using a facility which is a multiple of 256 the assignment of msg->facility in log_store() implicitly truncates it to 0, i.e. LOG_KERN, allowing users of /dev/kmsg to spoof kernel messages as shown below: The following call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0 >/dev/kmsg ...leads to the following log entry (dmesg -x | tail -n 1): user :emerg : [ 66.137758] Kernel panic - not syncing: beer empty However, this call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0x800 >/dev/kmsg ...leads to the slightly different log entry (note the kernel facility): kern :emerg : [ 74.177343] Kernel panic - not syncing: beer empty Fix that by limiting the user provided facility to 8 bit right from the beginning and catch the truncation early. 
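For illustration only, not part of the patch: the arithmetic behind the spoof described above. Because the facility is kept in a u8, any prefix value that is a multiple of 2048 produces a facility that is a multiple of 256 and collapses to 0 once stored:

	unsigned int prefix = 0x800;		/* userland writes "<2048>...": level = prefix & 7 = 0 */
	unsigned int facility = prefix >> 3;	/* 0x100 */
	u8 stored = facility;			/* silently truncated to 0 == LOG_KERN */
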
Fixes: 7ff9554bb578 ("printk: convert byte-buffer to variable-length...") Signed-off-by: Mathias Krause Cc: Greg Kroah-Hartman Cc: Petr Mladek Cc: Alex Elder Cc: Joe Perches Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f354..2ce8826 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,6 +269,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -612,7 +615,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; -- cgit v0.10.2 From 3cbcca8a80ea30f7e3efb29217193a089484de28 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 6 Nov 2015 16:30:41 -0800 Subject: get_maintainer: add missing documentation for --git-blame-signatures I really haven't used this option much myself, so feel free to improve on the documentation for it. I just noticed it while inspecting this script for undocumented features. Signed-off-by: Brian Norris Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 98bae86..6ba0976 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -781,6 +781,7 @@ MAINTAINER field selection options: --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers) --git-min-percent => minimum percentage of commits required (default: $email_git_min_percent) --git-blame => use git blame to find modified commits for patch or file + --git-blame-signatures => when used with --git-blame, also include all commit signers --git-since => git history to use (default: $email_git_since) --hg-since => hg history to use (default: $email_hg_since) --interactive => display a menu (mostly useful if used with the --git option) -- cgit v0.10.2 From cc7ff0ef6eca3deeea4a424ca47a67c8450d5424 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 6 Nov 2015 16:30:43 -0800 Subject: get_maintainer: it's '--pattern-depth', not '-pattern-depth' Though it appears that Perl's GetOptions will take either, the latter is not documented in the options listing. 
Signed-off-by: Brian Norris Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6ba0976..6c30727 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -813,7 +813,7 @@ Other options: --help => show this help information Default options: - [--email --nogit --git-fallback --m --n --l --multiline -pattern-depth=0 + [--email --nogit --git-fallback --m --n --l --multiline --pattern-depth=0 --remove-duplicates --rolestats] Notes: -- cgit v0.10.2 From b1312bfe61c08684f7325f17858933017bae7f59 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 6 Nov 2015 16:30:46 -0800 Subject: get_maintainer: add --no-foo options to --help Many flag options are boolean and support both a positive and a negative invocation from the command line. Some of these are even mentioned by example (e.g., --nogit is mentioned as a default option), but they aren't explicitly mentioned in the list of options. It happens that some of these are pretty important, as they are default-on, and to turn them off, you have to know about the --no-foo version. Rather than clutter the whole help text with bracketed '--[no]foo', let's just mention the general rule, a la 'man gcc'. Signed-off-by: Brian Norris Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6c30727..145f1bf 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -845,6 +845,9 @@ Notes: Entries in this file can be any command line argument. This file is prepended to any additional command line arguments. Multiple lines and # comments are allowed. + Most options have both positive and negative forms. + The negative forms for -- are --no and --no-. + EOT } -- cgit v0.10.2 From 4f07510df2e8c47fd65b8ffaaf6c5d334d59d598 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 6 Nov 2015 16:30:49 -0800 Subject: get_maintainer: --r (list reviewer) is on by default We don't consistenly document the default value next to the option listing, but we do have a list of defaults here, so let's keep it up to date. Signed-off-by: Brian Norris Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 145f1bf..0eec34a 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -813,7 +813,7 @@ Other options: --help => show this help information Default options: - [--email --nogit --git-fallback --m --n --l --multiline --pattern-depth=0 + [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0 --remove-duplicates --rolestats] Notes: -- cgit v0.10.2 From 2a7cb1dc82fc2a52e747b4c496c13f6575fb1790 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Nov 2015 16:30:52 -0800 Subject: get_maintainer: add subsystem to reviewer output Reviewer output currently does not include the subsystem that matched. Add it. 
Miscellanea: o Add a get_subsystem_name routine to centralize this Signed-off-by: Joe Perches Tested-by: Krzysztof Kozlowski Cc: Lee Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 0eec34a..cab641a 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -974,20 +974,29 @@ sub find_ending_index { return $index; } -sub get_maintainer_role { +sub get_subsystem_name { my ($index) = @_; - my $i; my $start = find_starting_index($index); - my $end = find_ending_index($index); - my $role = "unknown"; my $subsystem = $typevalue[$start]; if ($output_section_maxlen && length($subsystem) > $output_section_maxlen) { $subsystem = substr($subsystem, 0, $output_section_maxlen - 3); $subsystem =~ s/\s*$//; $subsystem = $subsystem . "..."; } + return $subsystem; +} + +sub get_maintainer_role { + my ($index) = @_; + + my $i; + my $start = find_starting_index($index); + my $end = find_ending_index($index); + + my $role = "unknown"; + my $subsystem = get_subsystem_name($index); for ($i = $start + 1; $i < $end; $i++) { my $tv = $typevalue[$i]; @@ -1021,16 +1030,7 @@ sub get_maintainer_role { sub get_list_role { my ($index) = @_; - my $i; - my $start = find_starting_index($index); - my $end = find_ending_index($index); - - my $subsystem = $typevalue[$start]; - if ($output_section_maxlen && length($subsystem) > $output_section_maxlen) { - $subsystem = substr($subsystem, 0, $output_section_maxlen - 3); - $subsystem =~ s/\s*$//; - $subsystem = $subsystem . "..."; - } + my $subsystem = get_subsystem_name($index); if ($subsystem eq "THE REST") { $subsystem = ""; @@ -1118,7 +1118,8 @@ sub add_categories { } } if ($email_reviewer) { - push_email_addresses($pvalue, 'reviewer'); + my $subsystem = get_subsystem_name($i); + push_email_addresses($pvalue, "reviewer:$subsystem"); } } elsif ($ptype eq "T") { push(@scm, $pvalue); -- cgit v0.10.2 From cd2c3e7f94f3d8210320bfb9b5712bdcbb12f8db Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 6 Nov 2015 16:30:55 -0800 Subject: MAINTAINERS: add missing extcon directory Add the missing extcon directory to maintain them. When using get_maintainer.pl, the result should include the correct maintainer information. Signed-off-by: Chanwoo Choi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 653ee9a..6318e95 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4207,7 +4207,10 @@ L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git S: Maintained F: drivers/extcon/ +F: include/linux/extcon/ +F: include/linux/extcon.h F: Documentation/extcon/ +F: Documentation/devicetree/bindings/extcon/ EXYNOS DP DRIVER M: Jingoo Han -- cgit v0.10.2 From e2eb53aa96754b97d158eff884dde88abbad925e Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:30:58 -0800 Subject: bitops.h: improve sign_extend32()'s documentation It is often overlooked that sign_extend32(), despite its name, is safe to use for 16 and 8 bit types as well. This should help prevent sign extension being done manually some other way. Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bitops.h b/include/linux/bitops.h index e635533..5629923 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -164,6 +164,8 @@ static inline __u8 ror8(__u8 word, unsigned int shift) * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit + * + * This is safe to use for 16- and 8-bit types as well. */ static inline __s32 sign_extend32(__u32 value, int index) { -- cgit v0.10.2 From 48e203e21b29cd4b2c58403fe8bca68e2e854895 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:02 -0800 Subject: bitops.h: add sign_extend64() Months back, this was discussed, see https://lkml.org/lkml/2015/1/18/289 The result was the 64-bit version being "likely fine", "valuable" and "correct". The discussion fell asleep but since there are possible users, let's add it. Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5629923..2b8ed12 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -173,6 +173,17 @@ static inline __s32 sign_extend32(__u32 value, int index) return (__s32)(value << shift) >> shift; } +/** + * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit + * @value: value to sign extend + * @index: 0 based bit index (0<=index<64) to sign bit + */ +static inline __s64 sign_extend64(__u64 value, int index) +{ + __u8 shift = 63 - index; + return (__s64)(value << shift) >> shift; +} + static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) -- cgit v0.10.2 From 06d8f8178c5709ff21cfbe16b6c078d2669b6e80 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:05 -0800 Subject: arch/sh/kernel/traps_64.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sh/kernel/cpu/sh5/unwind.c b/arch/sh/kernel/cpu/sh5/unwind.c index 10aed41..3a4fed4 100644 --- a/arch/sh/kernel/cpu/sh5/unwind.c +++ b/arch/sh/kernel/cpu/sh5/unwind.c @@ -159,7 +159,7 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, /* Sign extend */ regcache[dest] = - ((((s64)(u64)op >> 10) & 0xffff) << 54) >> 54; + sign_extend64((((u64)op >> 10) & 0xffff), 9); break; case (0xd0 >> 2): /* addi */ case (0xd4 >> 2): /* addi.l */ diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 112ea11..d208c27 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -101,7 +101,7 @@ static int generate_and_check_address(struct pt_regs *regs, if (displacement_not_indexed) { __s64 displacement; displacement = (opcode >> 10) & 0x3ff; - displacement = ((displacement << 54) >> 54); /* sign extend */ + displacement = sign_extend64(displacement, 9); addr = (__u64)((__s64)base_address + (displacement << width_shift)); } else { __u64 offset; -- cgit v0.10.2 From 78e3c7951021b4e1a554b3d619506b55b0619073 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Fri, 6 Nov 2015 16:31:08 -0800 Subject: arch/x86/kernel/cpu/perf_event_msr.c: use sign_extend64() for sign extension Signed-off-by: Martin Kepplinger Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: George Spelvin Cc: Rasmus Villemoes Cc: Maxime Coquelin Cc: Denys Vlasenko Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index f32ac13..ec863b9 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -163,10 +163,9 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { - delta <<= 32; - delta >>= 32; /* sign extend */ - } + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + local64_add(now - prev, &event->count); } -- cgit v0.10.2 From 1c78bc170f393317dfa9d57baa599a51061ea86a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 6 Nov 2015 16:31:11 -0800 Subject: lib/halfmd4.c: use rol32 inline function in the ROUND macro provides rol32() inline function, let's use already predefined function instead of direct expression. Signed-off-by: Alexander Kuleshov Cc: Herbert Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/halfmd4.c b/lib/halfmd4.c index a8fe627..137e861 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -1,6 +1,7 @@ #include #include #include +#include /* F, G and H are basic MD4 functions: selection, majority, parity */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) @@ -14,7 +15,7 @@ * Rotation is separate from addition to prevent recomputation */ #define ROUND(f, a, b, c, d, x, s) \ - (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s))) + (a += f(b, c, d) + x, a = rol32(a, s)) #define K1 0 #define K2 013240474631UL #define K3 015666365641UL -- cgit v0.10.2 From 943ba6503802a46318cb9f5ab45be31d42e6f884 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Nov 2015 16:31:14 -0800 Subject: lib/test-string_helpers.c: add string_get_size() tests Add a couple of simple tests for string_get_size(). 
The last one will hang the kernel without the 'lib/string_helpers.c: fix infinite loop in string_get_size()' fix. Signed-off-by: Vitaly Kuznetsov Cc: James Bottomley Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: "K. Y. Srinivasan" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 8e376ef..98866a7 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -326,6 +326,39 @@ out: kfree(out_test); } +#define string_get_size_maxbuf 16 +#define test_string_get_size_one(size, blk_size, units, exp_result) \ + do { \ + BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \ + __test_string_get_size((size), (blk_size), (units), \ + (exp_result)); \ + } while (0) + + +static __init void __test_string_get_size(const u64 size, const u64 blk_size, + const enum string_size_units units, + const char *exp_result) +{ + char buf[string_get_size_maxbuf]; + + string_get_size(size, blk_size, units, buf, sizeof(buf)); + if (!memcmp(buf, exp_result, strlen(exp_result) + 1)) + return; + + buf[sizeof(buf) - 1] = '\0'; + pr_warn("Test 'test_string_get_size_one' failed!\n"); + pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n", + size, blk_size, units); + pr_warn("expected: '%s', got '%s'\n", exp_result, buf); +} + +static __init void test_string_get_size(void) +{ + test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB"); + test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB"); + test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B"); +} + static int __init test_string_helpers_init(void) { unsigned int i; @@ -344,6 +377,9 @@ static int __init test_string_helpers_init(void) for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++) test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1); + /* Test string_get_size() */ + test_string_get_size(); + return -EINVAL; } module_init(test_string_helpers_init); -- cgit v0.10.2 From 2cf12f821cd4f996bfabeec23d8f25e7a2052a28 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 6 Nov 2015 16:31:17 -0800 Subject: lib/llist.c: fix data race in llist_del_first llist_del_first reads entry->next, but it did not acquire visibility over the entry node. As the result it can get a stale value of entry->next (e.g. NULL or whatever garbage was there before the appending thread wrote correct value). And then commit that value as llist head with cmpxchg. That will corrupt llist. Note there is a control-dependency between read of head->first and read of entry->next, but it does not make the code correct. Kernel memory model unambiguously says: "A load-load control dependency requires a full read memory barrier". Use smp_load_acquire to acquire visibility over the entry node. The data race was found with KernelThreadSanitizer (KTSAN). 
Here is an example of KTSAN report: ThreadSanitizer: data-race in llist_del_first Read of size 1 by thread T389 (K2630, CPU0): [] llist_del_first+0x39/0x70 lib/llist.c:74 [< inlined >] tty_buffer_alloc drivers/tty/tty_buffer.c:181 [] __tty_buffer_request_room+0xb4/0x250 drivers/tty/tty_buffer.c:292 [] tty_insert_flip_string_fixed_flag+0x6c/0x150 drivers/tty/tty_buffer.c:337 [< inlined >] tty_insert_flip_string include/linux/tty_flip.h:35 [] pty_write+0x72/0xc0 drivers/tty/pty.c:110 [< inlined >] process_output_block drivers/tty/n_tty.c:611 [] n_tty_write+0x346/0x7f0 drivers/tty/n_tty.c:2401 [< inlined >] do_tty_write drivers/tty/tty_io.c:1159 [] tty_write+0x21f/0x3f0 drivers/tty/tty_io.c:1245 [] __vfs_write+0x5f/0x1f0 fs/read_write.c:489 [] vfs_write+0xef/0x280 fs/read_write.c:538 [< inlined >] SYSC_write fs/read_write.c:585 [] SyS_write+0x70/0xe0 fs/read_write.c:577 [] entry_SYSCALL_64_fastpath+0x12/0x71 arch/x86/entry/entry_64.S:186 Previous write of size 8 by thread T226 (K761, CPU0): [] llist_add_batch+0x32/0x70 lib/llist.c:44 (discriminator 16) [< inlined >] llist_add include/linux/llist.h:180 [] tty_buffer_free+0x6c/0xb0 drivers/tty/tty_buffer.c:221 [] flush_to_ldisc+0x107/0x300 drivers/tty/tty_buffer.c:514 [] process_one_work+0x47e/0x930 kernel/workqueue.c:2036 [] worker_thread+0xb0/0x900 kernel/workqueue.c:2170 [] kthread+0x150/0x170 kernel/kthread.c:209 [] ret_from_fork+0x3f/0x70 arch/x86/entry/entry_64.S:526 Signed-off-by: Dmitry Vyukov Reviewed-by: Paul E. McKenney Cc: Rasmus Villemoes Cc: Huang Ying Cc: Konstantin Serebryany Cc: Andrey Konovalov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/llist.c b/lib/llist.c index 0b0e977..ae5872b 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -66,12 +66,12 @@ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *old_entry, *next; - entry = head->first; + entry = smp_load_acquire(&head->first); for (;;) { if (entry == NULL) return NULL; old_entry = entry; - next = entry->next; + next = READ_ONCE(entry->next); entry = cmpxchg(&head->first, old_entry, next); if (entry == old_entry) break; -- cgit v0.10.2 From 0a9df786a6ae2f898114bdd242b64920dedf53bd Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:31:20 -0800 Subject: lib/kasprintf.c: introduce kvasprintf_const This adds kvasprintf_const which tries to use kstrdup_const if possible: If the format string contains no % characters, or if the format string is exactly "%s", we delegate to kstrdup_const. Otherwise, we fall back to kvasprintf. Just as for kstrdup_const, the main motivation is to save memory by reusing .rodata when possible. The return value should be freed by kfree_const, just like for kstrdup_const. There is deliberately no kasprintf_const: In the vast majority of cases, the format string argument is a literal, so one can determine statically whether one could instead use kstrdup_const directly (which would also require one to change all corresponding kfree calls to kfree_const). 
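A rough userspace model of that decision logic, for illustration only: fake_kvasprintf_const() below is an assumption (not the kernel code), pointer reuse stands in for kstrdup_const() on .rodata, and glibc's vasprintf() stands in for the allocating fallback.

    #define _GNU_SOURCE                        /* for vasprintf() */
    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    static const char *fake_kvasprintf_const(const char *fmt, ...)
    {
        va_list ap;
        const char *res;

        va_start(ap, fmt);
        if (!strchr(fmt, '%'))
            res = fmt;                         /* no conversions: reuse as-is */
        else if (!strcmp(fmt, "%s"))
            res = va_arg(ap, const char *);    /* sole vararg may be reusable too */
        else {
            char *buf = NULL;
            if (vasprintf(&buf, fmt, ap) < 0)
                buf = NULL;
            res = buf;                         /* general case: a real allocation */
        }
        va_end(ap);
        return res;
    }

    int main(void)
    {
        printf("%s\n", fake_kvasprintf_const("plain literal"));
        printf("%s\n", fake_kvasprintf_const("%s", "sole string argument"));
        printf("%s\n", fake_kvasprintf_const("pid-%d", 42));   /* demo only, not freed */
        return 0;
    }

The three calls exercise the three branches described above: the literal and the sole "%s" argument are returned without copying, while the formatted case falls back to a real allocation.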
Signed-off-by: Rasmus Villemoes Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5582410..2c13f74 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -413,6 +413,8 @@ extern __printf(2, 3) char *kasprintf(gfp_t gfp, const char *fmt, ...); extern __printf(2, 0) char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); +extern __printf(2, 0) +const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args); extern __scanf(2, 3) int sscanf(const char *, const char *, ...); diff --git a/lib/kasprintf.c b/lib/kasprintf.c index 32f1215..f194e6e 100644 --- a/lib/kasprintf.c +++ b/lib/kasprintf.c @@ -31,6 +31,22 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) } EXPORT_SYMBOL(kvasprintf); +/* + * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt + * (or the sole vararg) points to rodata, we will then save a memory + * allocation and string copy. In any case, the return value should be + * freed using kfree_const(). + */ +const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) +{ + if (!strchr(fmt, '%')) + return kstrdup_const(fmt, gfp); + if (!strcmp(fmt, "%s")) + return kstrdup_const(va_arg(ap, const char*), gfp); + return kvasprintf(gfp, fmt, ap); +} +EXPORT_SYMBOL(kvasprintf_const); + char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; -- cgit v0.10.2 From f773f32d71a4ed9a645634da107cd249e09e1180 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 6 Nov 2015 16:31:23 -0800 Subject: lib/kobject.c: use kvasprintf_const for formatting ->name Sometimes kobject_set_name_vargs is called with a format string conaining no %, or a format string of precisely "%s", where the single vararg happens to point to .rodata. kvasprintf_const detects these cases for us and returns a copy of that pointer instead of duplicating the string, thus saving some run-time memory. Otherwise, it falls back to kvasprintf. We just need to always deallocate ->name using kfree_const. Unfortunately, the dance we need to do to perform the '/' -> '!' sanitization makes the resulting code rather ugly. I instrumented kstrdup_const to provide some statistics on the memory saved, and for me this gave an additional ~14KB after boot (306KB was already saved; this patch bumped that to 320KB). I have KMALLOC_SHIFT_LOW==3, and since 80% of the kvasprintf_const hits were satisfied by an 8-byte allocation, the 14K would roughly be quadrupled when KMALLOC_SHIFT_LOW==5. Whether these numbers are sufficient to justify the ugliness I'll leave to others to decide. Signed-off-by: Rasmus Villemoes Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/kobject.c b/lib/kobject.c index 0554077..7cbccd2 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -257,18 +257,32 @@ static int kobject_add_internal(struct kobject *kobj) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs) { - char *s; + const char *s; if (kobj->name && !fmt) return 0; - s = kvasprintf(GFP_KERNEL, fmt, vargs); + s = kvasprintf_const(GFP_KERNEL, fmt, vargs); if (!s) return -ENOMEM; - /* ewww... some of these buggers have '/' in the name ... */ - strreplace(s, '/', '!'); - kfree(kobj->name); + /* + * ewww... some of these buggers have '/' in the name ... If + * that's the case, we need to make sure we have an actual + * allocated copy to modify, since kvasprintf_const may have + * returned something from .rodata. 
+ */ + if (strchr(s, '/')) { + char *t; + + t = kstrdup(s, GFP_KERNEL); + kfree_const(s); + if (!t) + return -ENOMEM; + strreplace(t, '/', '!'); + s = t; + } + kfree_const(kobj->name); kobj->name = s; return 0; @@ -466,7 +480,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name) envp[0] = devpath_string; envp[1] = NULL; - name = dup_name = kstrdup(new_name, GFP_KERNEL); + name = dup_name = kstrdup_const(new_name, GFP_KERNEL); if (!name) { error = -ENOMEM; goto out; @@ -486,7 +500,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name) kobject_uevent_env(kobj, KOBJ_MOVE, envp); out: - kfree(dup_name); + kfree_const(dup_name); kfree(devpath_string); kfree(devpath); kobject_put(kobj); @@ -634,7 +648,7 @@ static void kobject_cleanup(struct kobject *kobj) /* free name if we allocated it */ if (name) { pr_debug("kobject: '%s': free name\n", name); - kfree(name); + kfree_const(name); } } -- cgit v0.10.2 From 90224350eaaf8b8043b19c393048f732bc2e4120 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:31:26 -0800 Subject: lib/is_single_threaded.c: change current_is_single_threaded() to use for_each_thread() Change current_is_single_threaded() to use for_each_thread() rather than deprecated while_each_thread(). Signed-off-by: Oleg Nesterov Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index bd2bea9..391fd23 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -36,8 +36,7 @@ bool current_is_single_threaded(void) if (unlikely(p == task->group_leader)) continue; - t = p; - do { + for_each_thread(p, t) { if (unlikely(t->mm == mm)) goto found; if (likely(t->mm)) @@ -48,7 +47,7 @@ bool current_is_single_threaded(void) * forked before exiting. */ smp_rmb(); - } while_each_thread(p, t); + } } ret = true; found: -- cgit v0.10.2 From 8de1ee7ebfb4979c6444e81273e12e7a972c367d Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 6 Nov 2015 16:31:28 -0800 Subject: rbtree: clarify documentation of rbtree_postorder_for_each_entry_safe() I noticed that commit a20135ffbc44 ("writeback: don't drain bdi_writeback_congested on bdi destruction") added a usage of rbtree_postorder_for_each_entry_safe() in mm/backing-dev.c which appears to try to rb_erase() elements from an rbtree while iterating over it using rbtree_postorder_for_each_entry_safe(). Doing this will cause random nodes to be missed by the iteration because rb_erase() may rebalance the tree, changing the ordering that we're trying to iterate over. The previous documentation for rbtree_postorder_for_each_entry_safe() wasn't clear that this wasn't allowed, it was taken from the docs for list_for_each_entry_safe(), where erasing isn't a problem due to list_del() not reordering. Explicitly warn developers about this potential pit-fall. Note that I haven't fixed the actual issue that (it appears) the commit referenced above introduced (not familiar enough with that code). In general (and in this case), the patterns to follow are: - switch to rb_first() + rb_erase(), don't use rbtree_postorder_for_each_entry_safe(). - keep the postorder iteration and don't rb_erase() at all. Instead just clear the fields of rb_node & cgwb_congested_tree as required by other users of those structures. 
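For reference, the first pattern recommended above (rb_first() plus rb_erase()) looks roughly like this; struct item and drain_tree() are hypothetical, while the rbtree and slab calls are the usual kernel APIs.

    #include <linux/rbtree.h>
    #include <linux/slab.h>

    struct item {
        struct rb_node node;
        int key;
    };

    /* Erase-everything loop: re-read rb_first() after each rb_erase(), so any
     * rebalancing done by rb_erase() cannot cause nodes to be skipped. */
    static void drain_tree(struct rb_root *root)
    {
        struct rb_node *n;

        while ((n = rb_first(root)) != NULL) {
            struct item *it = rb_entry(n, struct item, node);

            rb_erase(n, root);
            kfree(it);
        }
    }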
[akpm@linux-foundation.org: tweak comments] Signed-off-by: Cody P Schafer Cc: John de la Garza Cc: Michel Lespinasse Cc: Peter Zijlstra Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 830c499..a5aa7ae 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -101,13 +101,21 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent }) /** - * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of - * given type safe against removal of rb_node entry + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ -- cgit v0.10.2 From 9f029f540c2f7e010e4922d44ba0dfd05da79f88 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 6 Nov 2015 16:31:31 -0800 Subject: lib/hexdump.c: truncate output in case of overflow There is a classical off-by-one error in case when we try to place, for example, 1+1 bytes as hex in the buffer of size 6. The expected result is to get an output truncated, but in the reality we get 6 bytes filed followed by terminating NUL. Change the logic how we fill the output in case of byte dumping into limited space. This will follow the snprintf() behaviour by truncating output even on half bytes. Fixes: 114fc1afb2de (hexdump: make it return number of bytes placed in buffer) Signed-off-by: Andy Shevchenko Reported-by: Aaro Koskinen Tested-by: Aaro Koskinen Cc: Al Viro Cc: Catalin Marinas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/hexdump.c b/lib/hexdump.c index 8d74c20..992457b 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -169,11 +169,15 @@ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, } } else { for (j = 0; j < len; j++) { - if (linebuflen < lx + 3) + if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = hex_asc_lo(ch); + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = ' '; } if (j) -- cgit v0.10.2 From 369c8dd390baffd77b892b563d03f800e65d2dfa Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Nov 2015 16:31:34 -0800 Subject: checkpatch: improve tests for fixes:, long lines and stack dumps in commit log Including BUG and stack dumps in commit logs makes checkpatch produce some false positive warning messages. checkpatch has multiple types of false positives: o Commit message lines > 75 chars o Stack dump address are mistaken for git commit IDs o Link: and Fixes: lines are allowed to be > 75 chars. 
o Fixes: style doesn't require ("") parentheses and double quotes like other uses of git commit ID and description. Fix these. Miscellanea: o Move the test for checking $commit_log_possible_stack_dump above the test for a long line commit message o Add test for hex address surrounded by square or angle brackets Signed-off-by: Joe Perches Reported-by: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f2a1131..2d88cbf9 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2313,42 +2313,43 @@ sub process { "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); } +# Check if the commit log is in a possible stack dump + if ($in_commit_log && !$commit_log_possible_stack_dump && + ($line =~ /^\s*(?:WARNING:|BUG:)/ || + $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || + # timestamp + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { + # stack dump address + $commit_log_possible_stack_dump = 1; + } + # Check for line lengths > 75 in commit log, warn once if ($in_commit_log && !$commit_log_long_line && - length($line) > 75 && - !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ || - # file delta changes - $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ || - # filename then : - $line =~ /^\s*(?:Fixes:|Link:)/i || - # A Fixes: or Link: line - $commit_log_possible_stack_dump)) { + length($line) > 75 && + !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ || + # file delta changes + $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ || + # filename then : + $line =~ /^\s*(?:Fixes:|Link:)/i || + # A Fixes: or Link: line + $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr); $commit_log_long_line = 1; } -# Check if the commit log is in a possible stack dump - if ($in_commit_log && !$commit_log_possible_stack_dump && - ($line =~ /^\s*(?:WARNING:|BUG:)/ || - $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || - # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address - $commit_log_possible_stack_dump = 1; - } - # Reset possible stack dump if a blank line is found - if ($in_commit_log && $commit_log_possible_stack_dump && - $line =~ /^\s*$/) { - $commit_log_possible_stack_dump = 0; - } + if ($in_commit_log && $commit_log_possible_stack_dump && + $line =~ /^\s*$/) { + $commit_log_possible_stack_dump = 0; + } # Check for git id commit length and improperly formed commit descriptions - if ($in_commit_log && + if ($in_commit_log && !$commit_log_possible_stack_dump && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || - ($line =~ /\b[0-9a-f]{12,40}\b/i && - $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) { + ($line =~ /\b[0-9a-f]{12,40}\b/i && + $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && + $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) { my $init_char = "c"; my $orig_commit = ""; my $short = 1; -- cgit v0.10.2 From 6d32f7a391466ed89d8a1018fdff5330766cc272 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Nov 2015 16:31:37 -0800 Subject: checkpatch: improve the unnecessary initialisers tests Global and static variables don't need to be initialized to 0. There is already a test for this but the output message doesn't mention booleans initialized to false. Improve the output message and the test by adding various forms with possible specific integer types and possible multiple zeros. 
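As an illustration (assumed sample declarations, not taken from the patch), these are the kinds of initialisers the test reports; the last two forms were not matched by the old (0|NULL|false) pattern.

    #include <stdbool.h>
    #include <stddef.h>

    static int counter = 0;             /* flagged before and after this patch */
    static void *cookie = NULL;         /* flagged before and after this patch */
    static bool enabled = false;        /* message now says "to false" rather than "to 0 or NULL" */
    static unsigned long mask = 0x0UL;  /* newly matched: hex zero with a type suffix */
    static long long spare = 000;       /* newly matched: multiple zeros */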
Miscellanea: o Use a variable to hold the possible 0 test Signed-off-by: Joe Perches Signed-off-by: Shailendra Verma Tested-by: Shailendra Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2d88cbf9..2b3c228 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -370,6 +370,8 @@ our $typeTypedefs = qr{(?x: $typeKernelTypedefs\b )}; +our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; + our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|)| (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| @@ -3334,21 +3336,20 @@ sub process { } # check for global initialisers. - if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*(?:0|NULL|false)\s*;/) { + if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*($zero_initializer)\s*;/) { if (ERROR("GLOBAL_INITIALISERS", - "do not initialise globals to 0 or NULL\n" . - $herecurr) && + "do not initialise globals to $1\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*$zero_initializer\s*;/$1;/; } } # check for static initialisers. - if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) { + if ($line =~ /^\+.*\bstatic\s.*=\s*($zero_initializer)\s*;/) { if (ERROR("INITIALISED_STATIC", - "do not initialise statics to 0 or NULL\n" . + "do not initialise statics to $1\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*$zero_initializer\s*;/$1;/; } } -- cgit v0.10.2 From da80a39fc962ceca085ddfb7d63e00309b305f17 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 6 Nov 2015 16:31:40 -0800 Subject: nilfs2: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); // Signed-off-by: Julia Lawall Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f47585b..c69455a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void) */ rcu_barrier(); - if (nilfs_inode_cachep) - kmem_cache_destroy(nilfs_inode_cachep); - if (nilfs_transaction_cachep) - kmem_cache_destroy(nilfs_transaction_cachep); - if (nilfs_segbuf_cachep) - kmem_cache_destroy(nilfs_segbuf_cachep); - if (nilfs_btree_path_cache) - kmem_cache_destroy(nilfs_btree_path_cache); + kmem_cache_destroy(nilfs_inode_cachep); + kmem_cache_destroy(nilfs_transaction_cachep); + kmem_cache_destroy(nilfs_segbuf_cachep); + kmem_cache_destroy(nilfs_btree_path_cache); } static int __init nilfs_init_cachep(void) -- cgit v0.10.2 From b7bed712d090c340b97d455c5cb62d151e004503 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:43 -0800 Subject: nilfs2: use nilfs_warning() in allocator implementation This uses nilfs_warning() to replace "printk(KERN_WARNING ...);" in the bitmap based allocator implementation of nilfs2. The warning messages are modified to include the device name and the inode number in each message. This makes it clear which metadata file of which device has output warnings such as "entry number xxxx already freed". 
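Purely to illustrate the resulting message style, a userspace stand-in (warn_demo() is an assumption, not nilfs code; the prefix merely mimics nilfs_warning(), while the message text comes from this patch):

    #include <stdarg.h>
    #include <stdio.h>

    static void warn_demo(const char *dev, const char *func, const char *fmt, ...)
    {
        va_list ap;

        fprintf(stderr, "NILFS warning (device %s): %s: ", dev, func);
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
    }

    int main(void)
    {
        warn_demo("sda1", "nilfs_palloc_commit_free_entry",
                  "entry number %llu already freed: ino=%lu\n",
                  123456ULL, 42UL);
        return 0;
    }

The point of the change is visible in the output: the device name, the calling function and the inode number appear together, so the report can be attributed.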
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 8df0f3b..afe98364 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -583,8 +583,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); @@ -620,8 +622,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); @@ -734,10 +738,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (!nilfs_clear_bit_atomic( nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) { - printk(KERN_WARNING - "%s: entry number %llu already freed\n", - __func__, - (unsigned long long)entry_nrs[j]); + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)entry_nrs[j], + (unsigned long)inode->i_ino); } else { n++; } -- cgit v0.10.2 From 4e9e63a671fbe13f448fb2e69dfdbb6c2a008368 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:45 -0800 Subject: nilfs2: do not call nilfs_mdt_bgl_lock() needlessly In the bitmap based allocator implementation, nilfs_mdt_bgl_lock() helper is frequently used to get a spinlock protecting a target block group. This reduces its usage and simplifies arguments of some related functions by directly passing a pointer to the spinlock. 
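A small userspace sketch of the new calling convention, with pthread mutexes and illustrative names standing in for the block-group locks and the nilfs helpers: the caller resolves the lock once and hands the pointer down (build with -pthread).

    #include <pthread.h>
    #include <stdio.h>

    #define NGROUPS 4

    static pthread_mutex_t bgl[NGROUPS];   /* stand-ins for the block-group locks */
    static unsigned long nfrees[NGROUPS];

    /* The helper no longer takes (inode, group): it is handed the lock directly. */
    static void group_desc_add_entries(unsigned long *nfree,
                                       pthread_mutex_t *lock, long n)
    {
        pthread_mutex_lock(lock);
        *nfree += n;
        pthread_mutex_unlock(lock);
    }

    int main(void)
    {
        int g;

        for (g = 0; g < NGROUPS; g++)
            pthread_mutex_init(&bgl[g], NULL);

        g = 2;
        pthread_mutex_t *lock = &bgl[g];   /* resolved once by the caller */

        group_desc_add_entries(&nfrees[g], lock, 1);
        group_desc_add_entries(&nfrees[g], lock, 1);
        printf("group %d nfrees=%lu\n", g, nfrees[g]);
        return 0;
    }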
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index afe98364..ff0d62c 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -133,38 +133,34 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc */ static unsigned long -nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, - const struct nilfs_palloc_group_desc *desc) +nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, + spinlock_t *lock) { unsigned long nfree; - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); - spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc * @n: delta to be added */ static void -nilfs_palloc_group_desc_add_entries(struct inode *inode, - unsigned long group, - struct nilfs_palloc_group_desc *desc, - u32 n) +nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, + spinlock_t *lock, u32 n) { - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); - spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + spin_unlock(lock); } /** @@ -332,17 +328,15 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, /** * nilfs_palloc_find_available_slot - find available slot in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @target: offset number of an entry in the group (start point) * @bitmap: bitmap of the group + * @target: offset number of an entry in the group (start point) * @bsize: size in bits + * @lock: spin lock protecting @bitmap */ -static int nilfs_palloc_find_available_slot(struct inode *inode, - unsigned long group, +static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned char *bitmap, - int bsize) + int bsize, + spinlock_t *lock) { int curr, pos, end, i; @@ -351,12 +345,11 @@ static int nilfs_palloc_find_available_slot(struct inode *inode, if (end > bsize) end = bsize; pos = nilfs_find_next_zero_bit(bitmap, end, target); - if (pos < end && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; - } else + } else { end = 0; + } for (i = 0, curr = end; i < bsize; @@ -370,10 +363,8 @@ static int nilfs_palloc_find_available_slot(struct inode *inode, if (end > bsize) end = bsize; pos = nilfs_find_next_zero_bit(bitmap, end, curr); - if ((pos < end) && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, - bitmap)) + if (pos < end && + !nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } } @@ -477,6 +468,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group, groups_per_desc_block; unsigned long i, j; + spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); @@ -501,8 +493,8 @@ int 
nilfs_palloc_prepare_alloc_entry(struct inode *inode, n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, desc++, group++) { - if (nilfs_palloc_group_desc_nfrees(inode, group, desc) - > 0) { + lock = nilfs_mdt_bgl_lock(inode, group); + if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { ret = nilfs_palloc_get_bitmap_block( inode, group, 1, &bitmap_bh); if (ret < 0) @@ -510,12 +502,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - inode, group, group_offset, bitmap, - entries_per_group); + bitmap, group_offset, + entries_per_group, lock); if (pos >= 0) { /* found a free entry */ nilfs_palloc_group_desc_add_entries( - inode, group, desc, -1); + desc, lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap(desc_bh->b_page); @@ -573,6 +565,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, unsigned long group, group_offset; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -580,15 +573,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + lock = nilfs_mdt_bgl_lock(inode, group); - if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, "entry number %llu already freed: ino=%lu\n", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -613,6 +606,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned char *bitmap; unsigned long group, group_offset; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -620,14 +614,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); - if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) + lock = nilfs_mdt_bgl_lock(inode, group); + + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, "entry number %llu already freed: ino=%lu\n", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -712,6 +707,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; + spinlock_t *lock; int i, j, n, ret; for (i = 0; i < nitems; i = j) { @@ -730,14 +726,14 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + lock = nilfs_mdt_bgl_lock(inode, group); for (j = i, n = 
0; (j < nitems) && nilfs_palloc_group_is_in(inode, group, entry_nrs[j]); j++) { nilfs_palloc_group(inode, entry_nrs[j], &group_offset); - if (!nilfs_clear_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) { + if (!nilfs_clear_bit_atomic(lock, group_offset, + bitmap)) { nilfs_warning(inode->i_sb, __func__, "entry number %llu already freed: ino=%lu\n", (unsigned long long)entry_nrs[j], @@ -746,7 +742,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) n++; } } - nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + nilfs_palloc_group_desc_add_entries(desc, lock, n); kunmap(bitmap_bh->b_page); kunmap(desc_bh->b_page); -- cgit v0.10.2 From 18c41b37f0f16a0d6e5b1a73563d0c1333e7ef70 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:48 -0800 Subject: nilfs2: refactor nilfs_palloc_find_available_slot() The current implementation of nilfs_palloc_find_available_slot() function is overkill. The underlying bit search routine is well optimized, so this uses it more simply in nilfs_palloc_find_available_slot(). Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index ff0d62c..b15daf8 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -335,39 +335,33 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - int bsize, + unsigned bsize, spinlock_t *lock) { - int curr, pos, end, i; + int pos, end = bsize; - if (target > 0) { - end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); - if (end > bsize) - end = bsize; - pos = nilfs_find_next_zero_bit(bitmap, end, target); - if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap)) - return pos; - } else { - end = 0; + if (likely(target < bsize)) { + pos = target; + do { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) + return pos; + } while (++pos < end); + + end = target; } - for (i = 0, curr = end; - i < bsize; - i += BITS_PER_LONG, curr += BITS_PER_LONG) { - /* wrap around */ - if (curr >= bsize) - curr = 0; - while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) - != ~0UL) { - end = curr + BITS_PER_LONG; - if (end > bsize) - end = bsize; - pos = nilfs_find_next_zero_bit(bitmap, end, curr); - if (pos < end && - !nilfs_set_bit_atomic(lock, pos, bitmap)) - return pos; - } + /* wrap around */ + for (pos = 0; pos < end; pos++) { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) + return pos; } + return -ENOSPC; } -- cgit v0.10.2 From b22580948c39d71fb150c1d53148a381011dd109 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:51 -0800 Subject: nilfs2: get rid of nilfs_palloc_group_is_in() This unfolds nilfs_palloc_group_is_in() helper function into nilfs_palloc_freev() function to simplify a range check and an index calculation repeatedy performed in a loop of the function. 
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index b15daf8..5b7ee36 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -673,22 +673,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, } /** - * nilfs_palloc_group_is_in - judge if an entry is in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @nr: serial number of the entry (e.g. inode number) - */ -static int -nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) -{ - __u64 first, last; - - first = group * nilfs_palloc_entries_per_group(inode); - last = first + nilfs_palloc_entries_per_group(inode) - 1; - return (nr >= first) && (nr <= last); -} - -/** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated @@ -701,6 +685,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; + __u64 group_min_nr; + const unsigned long epg = nilfs_palloc_entries_per_group(inode); spinlock_t *lock; int i, j, n, ret; @@ -715,6 +701,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) brelse(desc_bh); return ret; } + + /* Get the first entry number of the group */ + group_min_nr = (__u64)group * epg; + desc_kaddr = kmap(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); @@ -722,10 +712,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); for (j = i, n = 0; - (j < nitems) && nilfs_palloc_group_is_in(inode, group, - entry_nrs[j]); + j < nitems && entry_nrs[j] >= group_min_nr && + entry_nrs[j] < group_min_nr + epg; j++) { - nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + group_offset = entry_nrs[j] - group_min_nr; if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warning(inode->i_sb, __func__, -- cgit v0.10.2 From da019954dd821682d6b2a8330c9c90acb943c456 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:54 -0800 Subject: nilfs2: add helper functions to delete blocks from dat file This adds delete functions for data blocks of metadata files using bitmap based allocator. nilfs_palloc_delete_entry_block() deletes an entry block (e.g. block storing dat entries), and nilfs_palloc_delete_bitmap_block() deletes a bitmap block, respectively. These helpers are intended to be used in the successive change on deallocator of block addresses ("nilfs2: free unused dat file blocks during garbage collection"). 
Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 5b7ee36..225b797 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -236,6 +236,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, } /** + * nilfs_palloc_delete_block - delete a block on the persistent allocator file + * @inode: inode of metadata file using this allocator + * @blkoff: block offset + * @prev: nilfs_bh_assoc struct of the last used buffer + * @lock: spin lock protecting @prev + */ +static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + brelse(prev->bh); + prev->bh = NULL; + } + spin_unlock(lock); + return nilfs_mdt_delete_block(inode, blkoff); +} + +/** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number @@ -274,6 +294,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, } /** + * nilfs_palloc_delete_bitmap_block - delete a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + */ +static int nilfs_palloc_delete_bitmap_block(struct inode *inode, + unsigned long group) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_bitmap_blkoff(inode, + group), + &cache->prev_bitmap, &cache->lock); +} + +/** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) @@ -292,6 +328,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, } /** + * nilfs_palloc_delete_entry_block - delete an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry + */ +static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + &cache->prev_entry, &cache->lock); +} + +/** * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor * @inode: inode of metadata file using this allocator * @group: group number -- cgit v0.10.2 From d0c14a9ee79467cd6a04b281577e1e6f74806ab2 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:31:56 -0800 Subject: nilfs2: free unused dat file blocks during garbage collection As a nilfs2 volume ages, the amount of available disk space decreases little by little due to bloat of DAT (disk address translation) metadata file. Even if we delete all files in a file system and free their block addresses from the DAT file through a garbage collection, empty DAT blocks are not freed. This fixes the issue by extending the deallocator of block addresses so that empty data blocks and empty bitmap blocks of DAT are deleted. The following comparison shows the effect of this patch. Each shows disk amount information of a nilfs2 volume that we cleaned out by deleting all files and running gc after having filled 90% of its capacity. 
Before: Filesystem 1K-blocks Used Available Use% Mounted on /dev/sda1 500105212 3022844 472072192 1% /test After: Filesystem 1K-blocks Used Available Use% Mounted on /dev/sda1 500105212 16380 475078656 1% /test Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 225b797..b335a32 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -154,13 +154,17 @@ nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, * @lock: spin lock protecting @desc * @n: delta to be added */ -static void +static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, spinlock_t *lock, u32 n) { + u32 nfree; + spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); + nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); + return nfree; } /** @@ -735,12 +739,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; - __u64 group_min_nr; + __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); + const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned entry_start, end, pos; spinlock_t *lock; - int i, j, n, ret; + int i, j, k, ret; + u32 nfree; for (i = 0; i < nitems; i = j) { + int change_group = false; + int nempties = 0, n = 0; + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) @@ -755,17 +765,13 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - desc_kaddr = kmap(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); - for (j = i, n = 0; - j < nitems && entry_nrs[j] >= group_min_nr && - entry_nrs[j] < group_min_nr + epg; - j++) { - group_offset = entry_nrs[j] - group_min_nr; + + j = i; + entry_start = rounddown(group_offset, epb); + do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warning(inode->i_sb, __func__, @@ -775,18 +781,69 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) } else { n++; } - } - nilfs_palloc_group_desc_add_entries(desc, lock, n); + + j++; + if (j >= nitems || entry_nrs[j] < group_min_nr || + entry_nrs[j] >= group_min_nr + epg) { + change_group = true; + } else { + group_offset = entry_nrs[j] - group_min_nr; + if (group_offset >= entry_start && + group_offset < entry_start + epb) { + /* This entry is in the same block */ + continue; + } + } + + /* Test if the entry block is empty or not */ + end = entry_start + epb; + pos = nilfs_find_next_bit(bitmap, end, entry_start); + if (pos >= end) { + last_nrs[nempties++] = entry_nrs[j - 1]; + if (nempties >= ARRAY_SIZE(last_nrs)) + break; + } + + if (change_group) + break; + + /* Go on to the next entry block */ + entry_start = rounddown(group_offset, epb); + } while (true); kunmap(bitmap_bh->b_page); - kunmap(desc_bh->b_page); + mark_buffer_dirty(bitmap_bh); + brelse(bitmap_bh); + for (k = 0; k < nempties; k++) { + ret = nilfs_palloc_delete_entry_block(inode, + last_nrs[k]); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete block of entry %llu: ino=%lu, err=%d\n", + (unsigned long 
long)last_nrs[k], + (unsigned long)inode->i_ino, ret); + } + } + + desc_kaddr = kmap_atomic(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); + kunmap_atomic(desc_kaddr); mark_buffer_dirty(desc_bh); - mark_buffer_dirty(bitmap_bh); nilfs_mdt_mark_dirty(inode); - - brelse(bitmap_bh); brelse(desc_bh); + + if (nfree == nilfs_palloc_entries_per_group(inode)) { + ret = nilfs_palloc_delete_bitmap_block(inode, group); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n", + group, + (unsigned long)inode->i_ino, ret); + } + } } return 0; } diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 4bd6451..6e6f49a 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_set_bit_atomic ext2_set_bit_atomic #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le +#define nilfs_find_next_bit find_next_bit_le /** * struct nilfs_bh_assoc - block offset and buffer head association -- cgit v0.10.2 From 58497703837048ac501ce56056eb74b4361108fc Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Fri, 6 Nov 2015 16:31:59 -0800 Subject: nilfs2: add a tracepoint for tracking stage transition of segment construction This patch adds a tracepoint for tracking stage transition of block collection in segment construction. With the tracepoint, we can analysis the behavior of segment construction in depth. It would be useful for bottleneck detection and debugging, etc. The tracepoint is created with the standard trace API of linux (like ext3, ext4, f2fs and btrfs). So we can analysis with existing tools easily. Of course, more detailed analysis will be possible if we can create nilfs specific analysis tools. Below is an example of event dump with Brendan Gregg's perf-tools (https://github.com/brendangregg/perf-tools). Time consumption between each stage can be obtained. $ sudo bin/tpoint nilfs2:nilfs2_collection_stage_transition Tracing nilfs2:nilfs2_collection_stage_transition. Ctrl-C to end. segctord-14875 [003] ...1 28311.067794: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_INIT segctord-14875 [003] ...1 28311.068139: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_GC segctord-14875 [003] ...1 28311.068139: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_FILE segctord-14875 [003] ...1 28311.068486: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_IFILE segctord-14875 [003] ...1 28311.068540: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_CPFILE segctord-14875 [003] ...1 28311.068561: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_SUFILE segctord-14875 [003] ...1 28311.068565: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_DAT segctord-14875 [003] ...1 28311.068573: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_SR segctord-14875 [003] ...1 28311.068574: nilfs2_collection_stage_transition: sci = ffff8800ce6de000 stage = ST_DONE For capturing transition correctly, this patch adds wrappers for the member scnt of nilfs_cstage. With this change, every transition of the stage can produce trace event in a correct manner. 
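The new event can also be consumed without perf-tools. Below is a minimal userspace sketch (not part of the patch): it assumes tracefs is mounted at /sys/kernel/debug/tracing, that the running kernel provides the nilfs2 trace events, and that it runs as root; it enables the stage-transition event and echoes matching records from trace_pipe.

#include <stdio.h>
#include <string.h>

/*
 * Minimal consumer for the nilfs2_collection_stage_transition tracepoint.
 * Assumptions: tracefs mounted at /sys/kernel/debug/tracing (adjust TRACING
 * otherwise) and root privileges.
 */
#define TRACING "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[4096];
	FILE *pipe;

	if (write_str(TRACING "/events/nilfs2/nilfs2_collection_stage_transition/enable", "1")) {
		perror("enable");
		return 1;
	}

	pipe = fopen(TRACING "/trace_pipe", "r");
	if (!pipe) {
		perror("trace_pipe");
		return 1;
	}

	/* Print only the stage transition records, e.g. "... stage = ST_FILE" */
	while (fgets(line, sizeof(line), pipe))
		if (strstr(line, "nilfs2_collection_stage_transition"))
			fputs(line, stdout);

	fclose(pipe);
	return 0;
}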
Signed-off-by: Hitoshi Mitake Signed-off-by: Ryusuke Konishi Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c6abbad9..ef35404 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -77,6 +77,36 @@ enum { NILFS_ST_DONE, }; +#define CREATE_TRACE_POINTS +#include + +/* + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are + * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of + * the variable must use them because transition of stage count must involve + * trace events (trace_nilfs2_collection_stage_transition). + * + * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't + * produce tracepoint events. It is provided just for making the intention + * clear. + */ +static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci) +{ + sci->sc_stage.scnt++; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt) +{ + sci->sc_stage.scnt = next_scnt; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci) +{ + return sci->sc_stage.scnt; +} + /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ @@ -1062,7 +1092,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) size_t ndone; int err = 0; - switch (sci->sc_stage.scnt) { + switch (nilfs_sc_cstage_get(sci)) { case NILFS_ST_INIT: /* Pre-processes */ sci->sc_stage.flags = 0; @@ -1071,7 +1101,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_nblk_inc = 0; sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; if (mode == SC_LSEG_DSYNC) { - sci->sc_stage.scnt = NILFS_ST_DSYNC; + nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC); goto dsync_mode; } } @@ -1079,10 +1109,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.dirty_file_ptr = NULL; sci->sc_stage.gc_inode_ptr = NULL; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DAT; + nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; @@ -1103,7 +1133,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.gc_inode_ptr = NULL; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, @@ -1125,10 +1155,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.dirty_file_ptr = NULL; if (mode == SC_FLUSH_FILE) { - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; /* Fall through */ case NILFS_ST_IFILE: @@ -1136,7 +1166,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ err = nilfs_segctor_create_checkpoint(sci); if (unlikely(err)) @@ -1147,7 +1177,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, 
int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SUFILE: err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, &ndone); @@ -1163,7 +1193,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, @@ -1171,10 +1201,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ @@ -1184,7 +1214,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } /* End of a logical segment */ sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DSYNC: dsync_mode: @@ -1197,7 +1227,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DONE: return 0; @@ -1442,7 +1472,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, goto failed; /* The current segment is filled up */ - if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + if (mode != SC_LSEG_SR || + nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE) break; nilfs_clear_logs(&sci->sc_segbufs); @@ -1946,7 +1977,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; - sci->sc_stage.scnt = NILFS_ST_INIT; + nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; err = nilfs_segctor_collect_dirty_files(sci, nilfs); @@ -1974,7 +2005,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto failed; /* Avoid empty segment */ - if (sci->sc_stage.scnt == NILFS_ST_DONE && + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE && nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; @@ -1988,7 +2019,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_fill_in_file_bmap(sci); if (mode == SC_LSEG_SR && - sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) goto failed_to_write; @@ -2007,7 +2038,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) goto failed_to_write; - if (sci->sc_stage.scnt == NILFS_ST_DONE || + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { /* * At this point, we avoid double buffering @@ -2020,7 +2051,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (err) goto failed_to_write; } - } while (sci->sc_stage.scnt != NILFS_ST_DONE); + } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE); out: nilfs_segctor_drop_written_files(sci, nilfs); diff --git a/fs/nilfs2/segment.h 
b/fs/nilfs2/segment.h index a48d6de..0408b9b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -67,7 +67,8 @@ struct nilfs_recovery_info { /** * struct nilfs_cstage - Context of collection stage - * @scnt: Stage count + * @scnt: Stage count, must be accessed via wrappers: + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() * @flags: State flags * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file * @gc_inode_ptr: Pointer on the list of gc-inodes diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h new file mode 100644 index 0000000..573da00 --- /dev/null +++ b/include/trace/events/nilfs2.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nilfs2 + +#if !defined(_TRACE_NILFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NILFS2_H + +#include + +struct nilfs_sc_info; + +#define show_collection_stage(type) \ + __print_symbolic(type, \ + { NILFS_ST_INIT, "ST_INIT" }, \ + { NILFS_ST_GC, "ST_GC" }, \ + { NILFS_ST_FILE, "ST_FILE" }, \ + { NILFS_ST_IFILE, "ST_IFILE" }, \ + { NILFS_ST_CPFILE, "ST_CPFILE" }, \ + { NILFS_ST_SUFILE, "ST_SUFILE" }, \ + { NILFS_ST_DAT, "ST_DAT" }, \ + { NILFS_ST_SR, "ST_SR" }, \ + { NILFS_ST_DSYNC, "ST_DSYNC" }, \ + { NILFS_ST_DONE, "ST_DONE"}) + +TRACE_EVENT(nilfs2_collection_stage_transition, + + TP_PROTO(struct nilfs_sc_info *sci), + + TP_ARGS(sci), + + TP_STRUCT__entry( + __field(void *, sci) + __field(int, stage) + ), + + TP_fast_assign( + __entry->sci = sci; + __entry->stage = sci->sc_stage.scnt; + ), + + TP_printk("sci = %p stage = %s", + __entry->sci, + show_collection_stage(__entry->stage)) +); + +#endif /* _TRACE_NILFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE nilfs2 +#include -- cgit v0.10.2 From 44fda114601fa5edebeacecb265f09d802670bc0 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Fri, 6 Nov 2015 16:32:02 -0800 Subject: nilfs2: add a tracepoint for transaction events This patch adds a tracepoint for transaction events of nilfs. With the tracepoint, these events can be tracked: begin, abort, commit, trylock, lock, and unlock. Basically, these events have corresponding functions e.g. begin event corresponds nilfs_transaction_begin(). The unlock event is an exception. It corresponds to the iteration in nilfs_transaction_lock(). Only one tracepoint is introcued: nilfs2_transaction_transition. The above events are distinguished with newly introduced enum. With this tracepoint, we can analyse a critical section of segment constructoin. 
Sample output by tpoint of perf-tools: cp-4457 [000] ...1 63.266220: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 1 flags = 9 state = BEGIN cp-4457 [000] ...1 63.266221: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 0 flags = 9 state = COMMIT cp-4457 [000] ...1 63.266221: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800bf5ccc58 count = 0 flags = 9 state = COMMIT segctord-4371 [001] ...1 68.261196: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = TRYLOCK segctord-4371 [001] ...1 68.261280: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = LOCK segctord-4371 [001] ...1 68.261877: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 1 flags = 10 state = BEGIN segctord-4371 [001] ...1 68.262116: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 18 state = COMMIT segctord-4371 [001] ...1 68.265032: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 18 state = UNLOCK segctord-4371 [001] ...1 132.376847: nilfs2_transaction_transition: sb = ffff8802112b8800 ti = ffff8800b889bdf8 count = 0 flags = 10 state = TRYLOCK This patch also does trivial cleaning of comma usage in collection stage transition event for consistent coding style. Signed-off-by: Hitoshi Mitake Signed-off-by: Ryusuke Konishi Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ef35404..3fc4732 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -214,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb, { struct the_nilfs *nilfs; int ret = nilfs_prepare_segment_lock(ti); + struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) return ret; - if (ret > 0) + if (ret > 0) { + trace_ti = current->journal_info; + + trace_nilfs2_transaction_transition(sb, trace_ti, + trace_ti->ti_count, trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; + } sb_start_intwrite(sb); @@ -229,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb, ret = -ENOSPC; goto failed; } + + trace_ti = current->journal_info; + trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, + trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; failed: @@ -261,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb) ti->ti_flags |= NILFS_TI_COMMIT; if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); return 0; } if (nilfs->ns_writer) { @@ -272,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb) nilfs_segctor_do_flush(sci, 0); } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); + current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_SYNC) @@ -290,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb) BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); return; } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); + current->journal_info = 
ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); @@ -339,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb, current->journal_info = ti; for (;;) { + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK); + down_write(&nilfs->ns_segctor_sem); if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) break; @@ -350,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb, } if (gcflag) ti->ti_flags |= NILFS_TI_GC; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK); } static void nilfs_transaction_unlock(struct super_block *sb) @@ -362,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb) up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index 573da00..e5649ac 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -42,6 +42,59 @@ TRACE_EVENT(nilfs2_collection_stage_transition, show_collection_stage(__entry->stage)) ); +#ifndef TRACE_HEADER_MULTI_READ +enum nilfs2_transaction_transition_state { + TRACE_NILFS2_TRANSACTION_BEGIN, + TRACE_NILFS2_TRANSACTION_COMMIT, + TRACE_NILFS2_TRANSACTION_ABORT, + TRACE_NILFS2_TRANSACTION_TRYLOCK, + TRACE_NILFS2_TRANSACTION_LOCK, + TRACE_NILFS2_TRANSACTION_UNLOCK, +}; +#endif + +#define show_transaction_state(type) \ + __print_symbolic(type, \ + { TRACE_NILFS2_TRANSACTION_BEGIN, "BEGIN" }, \ + { TRACE_NILFS2_TRANSACTION_COMMIT, "COMMIT" }, \ + { TRACE_NILFS2_TRANSACTION_ABORT, "ABORT" }, \ + { TRACE_NILFS2_TRANSACTION_TRYLOCK, "TRYLOCK" }, \ + { TRACE_NILFS2_TRANSACTION_LOCK, "LOCK" }, \ + { TRACE_NILFS2_TRANSACTION_UNLOCK, "UNLOCK" }) + +TRACE_EVENT(nilfs2_transaction_transition, + TP_PROTO(struct super_block *sb, + struct nilfs_transaction_info *ti, + int count, + unsigned int flags, + enum nilfs2_transaction_transition_state state), + + TP_ARGS(sb, ti, count, flags, state), + + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, ti) + __field(int, count) + __field(unsigned int, flags) + __field(int, state) + ), + + TP_fast_assign( + __entry->sb = sb; + __entry->ti = ti; + __entry->count = count; + __entry->flags = flags; + __entry->state = state; + ), + + TP_printk("sb = %p ti = %p count = %d flags = %x state = %s", + __entry->sb, + __entry->ti, + __entry->count, + __entry->flags, + show_transaction_state(__entry->state)) +); + #endif /* _TRACE_NILFS2_H */ /* This part must be outside protection */ -- cgit v0.10.2 From 83eec5e6dd10f0b1ab83ee660c8be883b3da7ba8 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Fri, 6 Nov 2015 16:32:05 -0800 Subject: nilfs2: add tracepoints for analyzing sufile manipulation This patch adds tracepoints which would be useful for analyzing segment usage from a perspective of high level sufile manipulation (check, alloc, free). sufile is an important in-place updated metadata file, so analyzing the behavior would be useful for performance turning. example of usage (a case of allocation): $ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end. 
segctord-17800 [002] ...1 10671.867294: nilfs2_segment_usage_allocated: sufile = ffff880054f908a8 segnum = 2 segctord-17800 [002] ...1 10675.073477: nilfs2_segment_usage_allocated: sufile = ffff880054f908a8 segnum = 3 Signed-off-by: Hitoshi Mitake Signed-off-by: Ryusuke Konishi Cc: Steven Rostedt Cc: Benixon Dhas Cc: TK Kato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 2a869c3..7ff8f15 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -30,6 +30,8 @@ #include "mdt.h" #include "sufile.h" +#include + /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file @@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) break; /* never happens */ } } + trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) @@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; + + trace_nilfs2_segment_usage_allocated(sufile, segnum); + goto out_header; } @@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); + + trace_nilfs2_segment_usage_freed(sufile, segnum); } /** diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index e5649ac..1b65ba6 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -95,6 +95,73 @@ TRACE_EVENT(nilfs2_transaction_transition, show_transaction_state(__entry->state)) ); +TRACE_EVENT(nilfs2_segment_usage_check, + TP_PROTO(struct inode *sufile, + __u64 segnum, + unsigned long cnt), + + TP_ARGS(sufile, segnum, cnt), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + __field(unsigned long, cnt) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + __entry->cnt = cnt; + ), + + TP_printk("sufile = %p segnum = %llu cnt = %lu", + __entry->sufile, + __entry->segnum, + __entry->cnt) +); + +TRACE_EVENT(nilfs2_segment_usage_allocated, + TP_PROTO(struct inode *sufile, + __u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + __entry->sufile, + __entry->segnum) +); + +TRACE_EVENT(nilfs2_segment_usage_freed, + TP_PROTO(struct inode *sufile, + __u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + __entry->sufile, + __entry->segnum) +); + #endif /* _TRACE_NILFS2_H */ /* This part must be outside protection */ -- cgit v0.10.2 From a9cd207c23ca4fa5bd5f1092e867e87542e349a3 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Fri, 6 Nov 2015 16:32:08 -0800 Subject: nilfs2: add tracepoints for analyzing reading and writing metadata files This patch adds tracepoints for analyzing requests of reading and writing metadata files. The tracepoints cover every in-place mdt files (cpfile, sufile, and datfile). 
Example of tracing mdt_insert_new_block(): cp-14635 [000] ...1 30598.199309: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 155 cp-14635 [000] ...1 30598.199520: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 5 cp-14635 [000] ...1 30598.200828: nilfs2_mdt_insert_new_block: inode = ffff88022a8d0178 ino = 3 block = 253 Signed-off-by: Hitoshi Mitake Signed-off-by: Ryusuke Konishi Cc: Steven Rostedt Cc: TK Kato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index dee34d9..1125f40 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -33,6 +33,7 @@ #include "page.h" #include "mdt.h" +#include #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) @@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_uptodate(bh); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); + + trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block); + return 0; } @@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, get_bh(bh); submit_bh(mode, bh); ret = 0; + + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); out: get_bh(bh); *out_bh = bh; diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index 1b65ba6..c780581 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -162,6 +162,60 @@ TRACE_EVENT(nilfs2_segment_usage_freed, __entry->segnum) ); +TRACE_EVENT(nilfs2_mdt_insert_new_block, + TP_PROTO(struct inode *inode, + unsigned long ino, + unsigned long block), + + TP_ARGS(inode, ino, block), + + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned long, ino) + __field(unsigned long, block) + ), + + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->block = block; + ), + + TP_printk("inode = %p ino = %lu block = %lu", + __entry->inode, + __entry->ino, + __entry->block) +); + +TRACE_EVENT(nilfs2_mdt_submit_block, + TP_PROTO(struct inode *inode, + unsigned long ino, + unsigned long blkoff, + int mode), + + TP_ARGS(inode, ino, blkoff, mode), + + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned long, ino) + __field(unsigned long, blkoff) + __field(int, mode) + ), + + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->blkoff = blkoff; + __entry->mode = mode; + ), + + TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x", + __entry->inode, + __entry->ino, + __entry->blkoff, + __entry->mode) +); + #endif /* _TRACE_NILFS2_H */ /* This part must be outside protection */ -- cgit v0.10.2 From c35c7ac5da8ddfd7c6cd1acc29b052a15f437e24 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:32:11 -0800 Subject: MAINTAINERS: nilfs2: add header file for tracing This adds header file "include/trace/events/nilfs2.h" to maintainer-ship of nilfs2 so that updates to the nilfs2 header file go to the mailing list of nilfs2. 
Signed-off-by: Ryusuke Konishi Cc: Hitoshi Mitake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 6318e95..eabf3d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7491,6 +7491,7 @@ S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ F: include/linux/nilfs2_fs.h +F: include/trace/events/nilfs2.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi -- cgit v0.10.2 From 09ef29e0f6ac9f08ba4cc501ab4a3c33be526343 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:32:14 -0800 Subject: nilfs2: fix gcc unused-but-set-variable warnings Fix the following build warnings: $ make W=1 [...] CC [M] fs/nilfs2/btree.o fs/nilfs2/btree.c: In function 'nilfs_btree_split': fs/nilfs2/btree.c:923:8: warning: variable 'newptr' set but not used [-Wunused-but-set-variable] __u64 newptr; ^ fs/nilfs2/btree.c:922:8: warning: variable 'newkey' set but not used [-Wunused-but-set-variable] __u64 newkey; ^ CC [M] fs/nilfs2/dat.o fs/nilfs2/dat.c: In function 'nilfs_dat_prepare_end': fs/nilfs2/dat.c:158:8: warning: variable 'start' set but not used [-Wunused-but-set-variable] __u64 start; ^ CC [M] fs/nilfs2/segment.o fs/nilfs2/segment.c: In function 'nilfs_segctor_do_immediate_flush': fs/nilfs2/segment.c:2433:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] int err; ^ CC [M] fs/nilfs2/sufile.o fs/nilfs2/sufile.c: In function 'nilfs_sufile_alloc': fs/nilfs2/sufile.c:320:27: warning: variable 'ncleansegs' set but not used [-Wunused-but-set-variable] unsigned long nsegments, ncleansegs, nsus, cnt; ^ CC [M] fs/nilfs2/alloc.o fs/nilfs2/alloc.c: In function 'nilfs_palloc_prepare_alloc_entry': fs/nilfs2/alloc.c:478:38: warning: variable 'groups_per_desc_block' set but not used [-Wunused-but-set-variable] unsigned long n, entries_per_group, groups_per_desc_block; ^ Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index b335a32..2ccbf55 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -514,7 +514,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; - unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long n, entries_per_group; unsigned long i, j; spinlock_t *lock; int pos, ret; @@ -523,7 +523,6 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); - groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups) { diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 919fd5b..f609a85 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - __u64 newkey; - __u64 newptr; int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); @@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(right, 0); - newptr = path[level].bp_newreq.bpr_ptr; - if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); 
nilfs_btree_node_insert(right, path[level].bp_index, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0d5fada..7dc23f1 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - __u64 start; sector_t blocknr; void *kaddr; int ret; @@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) kaddr = kmap_atomic(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); - start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_atomic(kaddr); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3fc4732..3b65ada 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2492,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) { int mode = 0; - int err; spin_lock(&sci->sc_state_lock); mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? @@ -2500,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) spin_unlock(&sci->sc_state_lock); if (mode) { - err = nilfs_segctor_do_construct(sci, mode); + nilfs_segctor_do_construct(sci, mode); spin_lock(&sci->sc_state_lock); sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 7ff8f15..52821ff 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -319,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; void *kaddr; - unsigned long nsegments, ncleansegs, nsus, cnt; + unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -329,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_sem; kaddr = kmap_atomic(header_bh->b_page); header = kaddr + bh_offset(header_bh); - ncleansegs = le64_to_cpu(header->sh_ncleansegs); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_atomic(kaddr); -- cgit v0.10.2 From 4f05028f8d1af782cfd03d09e0a052e9745dc5ad Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 6 Nov 2015 16:32:16 -0800 Subject: nilfs2: fix gcc uninitialized-variable warnings in powerpc build Some false positive warnings are reported for powerpc build. 
The following warnings are reported in http://kisskb.ellerman.id.au/kisskb/buildresult/12519703/ CC fs/nilfs2/super.o fs/nilfs2/super.c: In function 'nilfs_resize_fs': fs/nilfs2/super.c:376:2: warning: 'blocknr' may be used uninitialized in this function [-Wuninitialized] fs/nilfs2/super.c:362:11: note: 'blocknr' was declared here CC fs/nilfs2/recovery.o fs/nilfs2/recovery.c: In function 'nilfs_salvage_orphan_logs': fs/nilfs2/recovery.c:631:21: warning: 'sum' may be used uninitialized in this function [-Wuninitialized] fs/nilfs2/recovery.c:585:32: note: 'sum' was declared here fs/nilfs2/recovery.c: In function 'nilfs_search_super_root': fs/nilfs2/recovery.c:873:11: warning: 'sum' may be used uninitialized in this function [-Wuninitialized] Another similar warning is reported in http://kisskb.ellerman.id.au/kisskb/buildresult/12520079/ CC fs/nilfs2/btree.o fs/nilfs2/btree.c: In function 'nilfs_btree_convert_and_insert': include/asm-generic/bitops/non-atomic.h:105:20: warning: 'bh' may be used uninitialized in this function [-Wuninitialized] fs/nilfs2/btree.c:1859:22: note: 'bh' was declared here This cleans out these warnings by forcing the variables to be initialized. Signed-off-by: Ryusuke Konishi Reported-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index f609a85..3a3821b 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1851,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { - struct buffer_head *bh; + struct buffer_head *bh = NULL; union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; struct nilfs_bmap_stats stats; int ret; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ff00a0b..9b4f205 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; @@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c69455a..354013e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) struct nilfs_super_block *nsbp; sector_t blocknr, newblocknr; unsigned long offset; - int sb2i = -1; /* array index of the secondary superblock */ + int sb2i; /* array index of the secondary superblock */ int ret = 0; /* nilfs->ns_sem must be locked by the caller. 
*/ @@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 0; blocknr = nilfs->ns_sbh[0]->b_blocknr; + } else { + sb2i = -1; + blocknr = 0; } if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) goto out; /* super block location is unchanged */ -- cgit v0.10.2 From 2e01fabe67ccaff1d59bda01e60a61f5fb0aa7b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:19 -0800 Subject: signals: kill block_all_signals() and unblock_all_signals() It is hardly possible to enumerate all problems with block_all_signals() and unblock_all_signals(). Just for example, 1. block_all_signals(SIGSTOP/etc) simply can't help if the caller is multithreaded. Another thread can dequeue the signal and force the group stop. 2. Even is the caller is single-threaded, it will "stop" anyway. It will not sleep, but it will spin in kernel space until SIGCONT or SIGKILL. And a lot more. In short, this interface doesn't work at all, at least the last 10+ years. Daniel said: Yeah the only times I played around with the DRM_LOCK stuff was when old drivers accidentally deadlocked - my impression is that the entire DRM_LOCK thing was never really tested properly ;-) Hence I'm all for purging where this leaks out of the drm subsystem. Signed-off-by: Oleg Nesterov Acked-by: Daniel Vetter Acked-by: Dave Airlie Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/gpu/drm/drm_lock.c b/drivers/gpu/drm/drm_lock.c index 4924d381..daa2ff1 100644 --- a/drivers/gpu/drm/drm_lock.c +++ b/drivers/gpu/drm/drm_lock.c @@ -38,8 +38,6 @@ #include "drm_legacy.h" #include "drm_internal.h" -static int drm_notifier(void *priv); - static int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context); /** @@ -118,14 +116,8 @@ int drm_legacy_lock(struct drm_device *dev, void *data, * really probably not the correct answer but lets us debug xkb * xserver for now */ if (!file_priv->is_master) { - sigemptyset(&dev->sigmask); - sigaddset(&dev->sigmask, SIGSTOP); - sigaddset(&dev->sigmask, SIGTSTP); - sigaddset(&dev->sigmask, SIGTTIN); - sigaddset(&dev->sigmask, SIGTTOU); dev->sigdata.context = lock->context; dev->sigdata.lock = master->lock.hw_lock; - block_all_signals(drm_notifier, dev, &dev->sigmask); } if (dev->driver->dma_quiescent && (lock->flags & _DRM_LOCK_QUIESCENT)) @@ -169,7 +161,6 @@ int drm_legacy_unlock(struct drm_device *dev, void *data, struct drm_file *file_ /* FIXME: Should really bail out here. */ } - unblock_all_signals(); return 0; } @@ -288,38 +279,6 @@ int drm_legacy_lock_free(struct drm_lock_data *lock_data, unsigned int context) } /** - * If we get here, it means that the process has called DRM_IOCTL_LOCK - * without calling DRM_IOCTL_UNLOCK. - * - * If the lock is not held, then let the signal proceed as usual. If the lock - * is held, then set the contended flag and keep the signal blocked. - * - * \param priv pointer to a drm_device structure. - * \return one if the signal should be delivered normally, or zero if the - * signal should be blocked. 
- */ -static int drm_notifier(void *priv) -{ - struct drm_device *dev = priv; - struct drm_hw_lock *lock = dev->sigdata.lock; - unsigned int old, new, prev; - - /* Allow signal delivery if lock isn't held */ - if (!lock || !_DRM_LOCK_IS_HELD(lock->lock) - || _DRM_LOCKING_CONTEXT(lock->lock) != dev->sigdata.context) - return 1; - - /* Otherwise, set flag to force call to - drmUnlock */ - do { - old = lock->lock; - new = old | _DRM_LOCK_CONT; - prev = cmpxchg(&lock->lock, old, new); - } while (prev != old); - return 0; -} - -/** * This function returns immediately and takes the hw lock * with the kernel context if it is free, otherwise it gets the highest priority when and if * it is eventually released. diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 8b5ce7c..f56cdce 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -822,7 +822,6 @@ struct drm_device { struct drm_sg_mem *sg; /**< Scatter gather memory */ unsigned int num_crtcs; /**< Number of CRTCs on this device */ - sigset_t sigmask; struct { int context; diff --git a/include/linux/sched.h b/include/linux/sched.h index eeb5066..923ec1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1570,9 +1570,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; + struct callback_head *task_works; struct audit_context *audit_context; @@ -2476,9 +2474,6 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s return ret; } -extern void block_all_signals(int (*notifier)(void *priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); extern void release_task(struct task_struct * p); extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe..f2cbd4e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. 
*/ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. -- cgit v0.10.2 From be0e6f290f78b84a3b21b8c8c46819c4514fe632 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:22 -0800 Subject: signal: turn dequeue_signal_lock() into kernel_dequeue_signal() 1. Rename dequeue_signal_lock() to kernel_dequeue_signal(). This matches another "for kthreads only" kernel_sigaction() helper. 2. Remove the "tsk" and "mask" arguments, they are always current and current->blocked. And it is simply wrong if tsk != current. 3. We could also remove the 3rd "siginfo_t *info" arg but it looks potentially useful. However we can simplify the callers if we change kernel_dequeue_signal() to accept info => NULL. 4. Remove _irqsave, it is never called from atomic context. Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo Cc: David Woodhouse Cc: Felipe Balbi Cc: Markus Pargmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1b87623..93b3f99 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -444,9 +444,7 @@ static int nbd_thread_recv(struct nbd_device *nbd) spin_unlock_irqrestore(&nbd->tasks_lock, flags); if (signal_pending(current)) { - siginfo_t info; - - ret = dequeue_signal_lock(current, ¤t->blocked, &info); + ret = kernel_dequeue_signal(NULL); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); mutex_lock(&nbd->tx_lock); @@ -560,11 +558,8 @@ static int nbd_thread_send(void *data) !list_empty(&nbd->waiting_queue)); if (signal_pending(current)) { - siginfo_t info; - int ret; + int ret = kernel_dequeue_signal(NULL); - ret = dequeue_signal_lock(current, ¤t->blocked, - &info); dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", task_pid_nr(current), current->comm, ret); mutex_lock(&nbd->tx_lock); @@ -592,10 +587,8 @@ static int nbd_thread_send(void *data) spin_unlock_irqrestore(&nbd->tasks_lock, flags); /* Clear maybe pending signals */ - if (signal_pending(current)) { - siginfo_t info; - dequeue_signal_lock(current, ¤t->blocked, &info); - } + if (signal_pending(current)) + kernel_dequeue_signal(NULL); return 0; } diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index cd54e72..5ec5338 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -2345,7 +2345,6 @@ static void fsg_disable(struct usb_function *f) static void handle_exception(struct fsg_common *common) { - siginfo_t info; int i; struct fsg_buffhd *bh; 
enum fsg_state old_state; @@ -2357,8 +2356,7 @@ static void handle_exception(struct fsg_common *common) * into a high-priority EXIT exception. */ for (;;) { - int sig = - dequeue_signal_lock(current, ¤t->blocked, &info); + int sig = kernel_dequeue_signal(NULL); if (!sig) break; if (sig != SIGUSR1) { diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index bb9cebc..f3145fd 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -121,13 +121,12 @@ static int jffs2_garbage_collect_thread(void *_c) /* Put_super will send a SIGKILL and then wait on the sem. */ while (signal_pending(current) || freezing(current)) { - siginfo_t info; unsigned long signr; if (try_to_freeze()) goto again; - signr = dequeue_signal_lock(current, ¤t->blocked, &info); + signr = kernel_dequeue_signal(NULL); switch(signr) { case SIGSTOP: diff --git a/include/linux/sched.h b/include/linux/sched.h index 923ec1a..3d54924 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2462,14 +2462,15 @@ extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +static inline int kernel_dequeue_signal(siginfo_t *info) { - unsigned long flags; + struct task_struct *tsk = current; + siginfo_t __info; int ret; - spin_lock_irqsave(&tsk->sighand->siglock, flags); - ret = dequeue_signal(tsk, mask, info); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_lock_irq(&tsk->sighand->siglock); + ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); + spin_unlock_irq(&tsk->sighand->siglock); return ret; } -- cgit v0.10.2 From 9a13049e83f346cb1cbd60c64e520a73c396af16 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:25 -0800 Subject: signal: introduce kernel_signal_stop() to fix jffs2_garbage_collect_thread() jffs2_garbage_collect_thread() can race with SIGCONT and sleep in TASK_STOPPED state after it was already sent. Add the new helper, kernel_signal_stop(), which does this correctly. 
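Taken together with kernel_dequeue_signal() above, the intended usage pattern in a kernel thread looks roughly like the sketch below. It is illustrative only (the function name, the SIGSTOP/SIGKILL policy and the one-second sleep are invented for the example) and assumes a kernel that already carries both helpers.

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

/*
 * Illustrative kthread loop (not from the patch): SIGSTOP parks the thread
 * through kernel_signal_stop() without racing SIGCONT, SIGKILL makes it exit.
 */
static int example_thread(void *data)
{
	allow_signal(SIGSTOP);
	allow_signal(SIGKILL);

	while (!kthread_should_stop()) {
		if (signal_pending(current)) {
			int sig = kernel_dequeue_signal(NULL);

			if (sig == SIGSTOP)
				kernel_signal_stop();
			else if (sig == SIGKILL)
				break;
		}

		/* ... one unit of background work would go here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}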
Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo Cc: David Woodhouse Cc: Felipe Balbi Cc: Markus Pargmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index f3145fd..53cc735 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -132,8 +132,7 @@ static int jffs2_garbage_collect_thread(void *_c) case SIGSTOP: jffs2_dbg(1, "%s(): SIGSTOP received\n", __func__); - set_current_state(TASK_STOPPED); - schedule(); + kernel_signal_stop(); break; case SIGKILL: diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d54924..4069feb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2475,6 +2475,16 @@ static inline int kernel_dequeue_signal(siginfo_t *info) return ret; } +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(¤t->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + schedule(); +} + extern void release_task(struct task_struct * p); extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); -- cgit v0.10.2 From 9317bb9696566e6759203ffcaa80481b725785b3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:28 -0800 Subject: signal: remove jffs2_garbage_collect_thread()->allow_signal(SIGCONT) jffs2_garbage_collect_thread() does allow_signal(SIGCONT) for no reason, SIGCONT will wake a stopped task up even if it is ignored. Signed-off-by: Oleg Nesterov Reviewed-by: Tejun Heo Cc: David Woodhouse Cc: Felipe Balbi Cc: Markus Pargmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 53cc735..e5c1783 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c) siginitset(&hupmask, sigmask(SIGHUP)); allow_signal(SIGKILL); allow_signal(SIGSTOP); - allow_signal(SIGCONT); allow_signal(SIGHUP); c->gc_task = current; -- cgit v0.10.2 From 5fa534c987784c4811757a34c425aff3ce3b5037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:31 -0800 Subject: coredump: ensure all coredumping tasks have SIGNAL_GROUP_COREDUMP task_will_free_mem() is wrong in many ways, and in particular the SIGNAL_GROUP_COREDUMP check is not reliable: a task can participate in the coredumping without SIGNAL_GROUP_COREDUMP bit set. change zap_threads() paths to always set SIGNAL_GROUP_COREDUMP even if other CLONE_VM processes can't react to SIGKILL. Fortunately, at least oom-kill case if fine; it kills all tasks sharing the same mm, so it should also kill the process which actually dumps the core. The change in prepare_signal() is not strictly necessary, it just ensures that the patch does not bring another subtle behavioural change. But it reminds us that this SIGNAL_GROUP_EXIT/COREDUMP case needs more changes. 
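The "CLONE_VM processes" the changelog refers to are ordinary processes that merely share an address space with the dumping task. The userspace sketch below is purely illustrative and unrelated to the kernel change itself: it creates such a process with clone(CLONE_VM) and then aborts, which is exactly the topology zap_threads() has to cope with. Whether a core file is actually produced depends on the usual RLIMIT_CORE/core_pattern settings.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int shared_counter;		/* lives in the shared address space */

static int child_fn(void *arg)
{
	shared_counter = 42;		/* visible to the parent because of CLONE_VM */
	pause();
	return 0;
}

int main(void)
{
	const size_t stack_size = 1024 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid;

	if (!stack)
		return 1;

	/*
	 * CLONE_VM without CLONE_THREAD: a separate process (own pid, own
	 * signal handling) that nevertheless shares mm with the parent.
	 */
	pid = clone(child_fn, stack + stack_size, CLONE_VM | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	sleep(1);
	printf("child %d wrote %d into our address space\n", pid, shared_counter);

	/* A core dump of this process must also stop the CLONE_VM child. */
	abort();
}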
Signed-off-by: Oleg Nesterov Cc: David Rientjes Cc: Kyle Walker Acked-by: Michal Hocko Cc: Stanislav Kozina Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/coredump.c b/fs/coredump.c index a8f7564..c66bb05 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -280,11 +280,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code) +static int zap_process(struct task_struct *start, int exit_code, int flags) { struct task_struct *t; int nr = 0; + /* ignore all signals except SIGKILL, see prepare_signal() */ + start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -311,10 +313,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - nr = zap_process(tsk, exit_code); tsk->signal->group_exit_task = tsk; - /* ignore all signals except SIGKILL, see prepare_signal() */ - tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + nr = zap_process(tsk, exit_code, 0); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); @@ -365,8 +365,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code); - p->signal->flags = SIGNAL_GROUP_EXIT; + nr += zap_process(p, exit_code, + SIGNAL_GROUP_EXIT); unlock_task_sighand(p, &flags); } break; diff --git a/kernel/signal.c b/kernel/signal.c index f2cbd4e..c0b01fe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -788,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. -- cgit v0.10.2 From d61ba58953fcf708c850bfd90b2858f2ddfbc34c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:34 -0800 Subject: coredump: change zap_threads() and zap_process() to use for_each_thread() Change zap_threads() paths to use for_each_thread() rather than while_each_thread(). While at it, change zap_threads() to avoid the nested if's to make the code more readable and lessen the indentation. 
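For reference, the two iteration idioms side by side; the counting functions are illustrative sketches, not code from the patch, and a real caller would hold tasklist_lock or rcu_read_lock() around either loop.

#include <linux/sched.h>

/* Old idiom: explicit seed plus do/while over the thread group. */
static int count_threads_old(struct task_struct *task)
{
	struct task_struct *t = task;
	int nr = 0;

	do {
		nr++;
	} while_each_thread(task, t);

	return nr;
}

/* New idiom: for_each_thread() needs no seeding and no trailing macro. */
static int count_threads_new(struct task_struct *task)
{
	struct task_struct *t;
	int nr = 0;

	for_each_thread(task, t)
		nr++;

	return nr;
}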
Signed-off-by: Oleg Nesterov Cc: David Rientjes Cc: Kyle Walker Cc: Michal Hocko Cc: Stanislav Kozina Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/coredump.c b/fs/coredump.c index c66bb05..1777331 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -290,15 +290,14 @@ static int zap_process(struct task_struct *start, int exit_code, int flags) start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; - t = start; - do { + for_each_thread(start, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); nr++; } - } while_each_thread(start, t); + } return nr; } @@ -360,18 +359,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, continue; if (g->flags & PF_KTHREAD) continue; - p = g; - do { - if (p->mm) { - if (unlikely(p->mm == mm)) { - lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code, - SIGNAL_GROUP_EXIT); - unlock_task_sighand(p, &flags); - } - break; + + for_each_thread(g, p) { + if (unlikely(!p->mm)) + continue; + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code, + SIGNAL_GROUP_EXIT); + unlock_task_sighand(p, &flags); } - } while_each_thread(g, p); + break; + } } rcu_read_unlock(); done: -- cgit v0.10.2 From 8b91a318e422a0201a7bd7ac2626eecec6b27ae1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 6 Nov 2015 16:32:37 -0800 Subject: fs/seq_file: use seq_* helpers in seq_hex_dump() This improves code readability. Signed-off-by: Andy Shevchenko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/seq_file.c b/fs/seq_file.c index 225586e..ad034fb 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -773,6 +773,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, { const u8 *ptr = buf; int i, linelen, remaining = len; + char *buffer; + size_t size; int ret; if (rowsize != 16 && rowsize != 32) @@ -794,15 +796,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, break; } + size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, - m->buf + m->count, m->size - m->count, - ascii); - if (ret >= m->size - m->count) { - seq_set_overflow(m); - } else { - m->count += ret; - seq_putc(m, '\n'); - } + buffer, size, ascii); + seq_commit(m, ret < size ? ret : -1); + + seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); -- cgit v0.10.2 From 25c6bb76eafe37c8963ae58a6a1bcf4069caeedb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 6 Nov 2015 16:32:40 -0800 Subject: seq_file: reuse string_escape_str() strint_escape_str() escapes input string by given criteria. In case of seq_escape() the criteria is to convert some characters to their octal representation. 
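The escaping rule itself is easy to model outside the kernel. The sketch below is a simplified userspace approximation of ESCAPE_OCTAL with an escape set; it is not the kernel's string_escape_str(), and the function name and buffer handling are invented for the example. Characters listed in the escape set come out as \ooo, everything else is copied through.

#include <stdio.h>
#include <string.h>

/*
 * Simplified userspace model (not the kernel implementation): characters
 * present in @esc are emitted as \ooo, everything else passes through.
 */
static void escape_octal(const char *s, const char *esc, char *out, size_t size)
{
	size_t n = 0;

	for (; *s && n + 4 < size; s++) {
		unsigned char c = *s;

		if (strchr(esc, c))
			n += snprintf(out + n, size - n, "\\%03o", c);
		else
			out[n++] = c;
	}
	out[n] = '\0';
}

int main(void)
{
	char buf[128];

	/* Escape spaces and newlines, as /proc/mounts-style output does. */
	escape_octal("mount point with spaces\n", " \n", buf, sizeof(buf));
	printf("%s\n", buf);	/* prints: mount\040point\040with\040spaces\012 */
	return 0;
}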
Signed-off-by: Andy Shevchenko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/seq_file.c b/fs/seq_file.c index ad034fb..00bbe2b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -377,26 +378,12 @@ EXPORT_SYMBOL(seq_release); */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { - char *end = m->buf + m->size; - char *p; - char c; + char *buf; + size_t size = seq_get_buf(m, &buf); + int ret; - for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { - if (!strchr(esc, c)) { - *p++ = c; - continue; - } - if (p + 3 < end) { - *p++ = '\\'; - *p++ = '0' + ((c & 0300) >> 6); - *p++ = '0' + ((c & 070) >> 3); - *p++ = '0' + (c & 07); - continue; - } - seq_set_overflow(m); - return; - } - m->count = p - m->buf; + ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc); + seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape); -- cgit v0.10.2 From 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 6 Nov 2015 16:32:42 -0800 Subject: fs, seqfile: always allow oom killer Since 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes") seq_buf_alloc() avoids calling the oom killer for PAGE_SIZE or smaller allocations; but larger allocations can use the oom killer via vmalloc(). Thus reads of small files can return ENOMEM, but larger files use the oom killer to avoid ENOMEM. The effect of this bug is that reads from /proc and other virtual filesystems can return ENOMEM instead of the preferred behavior - oom killing something (possibly the calling process). I don't know of anyone except Google who has noticed the issue. I suspect the fix is more needed in smaller systems where there isn't any reclaimable memory. But these seem like the kinds of systems which probably don't use the oom killer for production situations. Memory overcommit requires use of the oom killer to select a victim regardless of file size. Enable oom killer for small seq_buf_alloc() allocations. Fixes: 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes") Signed-off-by: David Rientjes Signed-off-by: Greg Thelen Acked-by: Eric Dumazet Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/seq_file.c b/fs/seq_file.c index 00bbe2b..e85664b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -26,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { void *buf; + gfp_t gfp = GFP_KERNEL; /* - * __GFP_NORETRY to avoid oom-killings with high-order allocations - - * it's better to fall back to vmalloc() than to kill things. + * For high order allocations, use __GFP_NORETRY to avoid oom-killing - + * it's better to fall back to vmalloc() than to kill things. For small + * allocations, just use GFP_KERNEL which will oom kill, thus no need + * for vmalloc fallback. */ - buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + if (size > PAGE_SIZE) + gfp |= __GFP_NORETRY | __GFP_NOWARN; + buf = kmalloc(size, gfp); if (!buf && size > PAGE_SIZE) buf = vmalloc(size); return buf; -- cgit v0.10.2 From de90a6bcaede81f35e8caf4566d1006267230377 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 6 Nov 2015 16:32:45 -0800 Subject: kexec: use file name as the output message prefix kexec output message misses the prefix "kexec", when Dave Young split the kexec code. Now, we use file name as the output message prefix. 
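The mechanism being leaned on here is the pr_fmt()/KBUILD_MODNAME convention: defining pr_fmt() before the first include makes every pr_*() call in that file carry the prefix, and KBUILD_MODNAME is supplied by the build system from the object name. A minimal module-style sketch (illustrative only; the module itself is made up):

/* The define must precede the includes so printk.h picks it up. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init prefix_demo_init(void)
{
	/* dmesg shows: "prefix_demo: hello, world" for an object built as prefix_demo.o */
	pr_warn("hello, world\n");
	return 0;
}

static void __exit prefix_demo_exit(void)
{
}

module_init(prefix_demo_init);
module_exit(prefix_demo_exit);
MODULE_LICENSE("GPL");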
Currently, the format of output message: [ 140.290795] SYSC_kexec_load: hello, world [ 140.291534] kexec: sanity_check_segment_list: hello, world Ideally, the format of output message: [ 30.791503] kexec: SYSC_kexec_load, Hello, world [ 79.182752] kexec_core: sanity_check_segment_list, Hello, world Remove the custom prefix "kexec" in output message. Signed-off-by: Minfei Huang Acked-by: Dave Young Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc3..d873b64 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a0..11b64a6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2..b70ada0 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v0.10.2 From 8639b46139b0e4ea3b1ab1c274e410ee327f1d89 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 6 Nov 2015 16:32:48 -0800 Subject: pidns: fix set/getpriority and ioprio_set/get in PRIO_USER mode setpriority(PRIO_USER, 0, x) will change the priority of tasks outside of the current pid namespace. This is in contrast to both the other modes of setpriority and the example of kill(-1). Fix this. getpriority and ioprio have the same failure mode, fix them too. Eric said: : After some more thinking about it this patch sounds justifiable. : : My goal with namespaces is not to build perfect isolation mechanisms : as that can get into ill defined territory, but to build well defined : mechanisms. And to handle the corner cases so you can use only : a single namespace with well defined results. : : In this case you have found the two interfaces I am aware of that : identify processes by uid instead of by pid. Which quite frankly is : weird. Unfortunately the weird unexpected cases are hard to handle : in the usual way. : : I was hoping for a little more information. Changes like this one we : have to be careful of because someone might be depending on the current : behavior. I don't think they are and I do think this make sense as part : of the pid namespace. Signed-off-by: Ben Segall Cc: Oleg Nesterov Cc: Al Viro Cc: Ambrose Feinstein Acked-by: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/ioprio.c b/block/ioprio.c index 31666c9..cc7800e 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f6..6af9212 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; -- cgit v0.10.2 From 002edb6f6f2a79bea50de11260ddc9572e6db731 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 6 Nov 2015 16:32:51 -0800 Subject: dma-mapping: tidy up dma_parms default handling Many DMA controllers and other devices set max_segment_size to indicate their scatter-gather capability, but have no interest in segment_boundary_mask. However, the existence of a dma_parms structure precludes the use of any default value, leaving them as zeros (assuming a properly kzalloc'ed structure). If a well-behaved IOMMU (or SWIOTLB) then tries to respect this by ensuring a mapped segment does not cross a zero-byte boundary, hilarity ensues. Since zero is a nonsensical value for either parameter, treat it as an indicator for "default", as might be expected. In the process, clean up a bit by replacing the bare constants with slightly more meaningful macros and removing the superfluous "else" statements. [akpm@linux-foundation.org: dma-mapping.h needs sizes.h for SZ_64K] Signed-off-by: Robin Murphy Reviewed-by: Sumit Semwal Acked-by: Marek Szyprowski Cc: Arnd Bergmann Cc: Sakari Ailus Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index ac07ff0..2e551e2 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -1,6 +1,7 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H +#include #include #include #include @@ -145,7 +146,9 @@ static inline void arch_teardown_dma_ops(struct device *dev) { } static inline unsigned int dma_get_max_seg_size(struct device *dev) { - return dev->dma_parms ? 
dev->dma_parms->max_segment_size : 65536; + if (dev->dma_parms && dev->dma_parms->max_segment_size) + return dev->dma_parms->max_segment_size; + return SZ_64K; } static inline unsigned int dma_set_max_seg_size(struct device *dev, @@ -154,14 +157,15 @@ static inline unsigned int dma_set_max_seg_size(struct device *dev, if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; - } else - return -EIO; + } + return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { - return dev->dma_parms ? - dev->dma_parms->segment_boundary_mask : 0xffffffff; + if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) + return dev->dma_parms->segment_boundary_mask; + return DMA_BIT_MASK(32); } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) @@ -169,8 +173,8 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; - } else - return -EIO; + } + return -EIO; } #ifndef dma_max_pfn -- cgit v0.10.2 From 7f8306429c4c75f9e2bf39fcfe990b0af2f7292d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 6 Nov 2015 16:32:55 -0800 Subject: dma-debug: check nents in dma_sync_sg* Like dma_unmap_sg, dma_sync_sg* should be called with the original number of entries passed to dma_map_sg, so do the same check in the sync path as we do in the unmap path. Signed-off-by: Robin Murphy Cc: Arnd Bergmann Cc: Marek Szyprowski Cc: Sumit Semwal Cc: Sakari Ailus Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/dma-debug.c b/lib/dma-debug.c index fcb65d2..8855f01 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1249,6 +1249,14 @@ static void check_sync(struct device *dev, dir2name[entry->direction], dir2name[ref->direction]); + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver syncs " + "DMA sg list with different entry count " + "[map count=%d] [sync count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + out: put_hash_bucket(bucket, &flags); } -- cgit v0.10.2 From 08d78658f393fefaa2e6507ea052c6f8ef4002a2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Nov 2015 16:32:58 -0800 Subject: panic: release stale console lock to always get the logbuf printed out In some cases we may end up killing the CPU holding the console lock while still having valuable data in logbuf. E.g. I'm observing the following: - A crash is happening on one CPU and console_unlock() is being called on some other. - console_unlock() tries to print out the buffer before releasing the lock and on slow console it takes time. - in the meanwhile crashing CPU does lots of printk()-s with valuable data (which go to the logbuf) and sends IPIs to all other CPUs. - console_unlock() finishes printing previous chunk and enables interrupts before trying to print out the rest, the CPU catches the IPI and never releases console lock. This is not the only possible case: in VT/fb subsystems we have many other console_lock()/console_unlock() users. Non-masked interrupts (or receiving NMI in case of extreme slowness) will have the same result. Getting the whole console buffer printed out on crash should be top priority. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. 
Srinivasan" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff..4579dbb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -147,6 +148,15 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. + */ + console_trylock(); + console_unlock(); + if (!panic_blink) panic_blink = no_blink; -- cgit v0.10.2 From cb7ae262e230064ba282094b7e1f60a092448b72 Mon Sep 17 00:00:00 2001 From: Anish Bhatt Date: Fri, 6 Nov 2015 16:33:01 -0800 Subject: include/linux/zutil.h: fix usage example of zlib_adler32() alder32 was renamed to zlib_adler32 since before 2.6.11. Signed-off-by: Anish Bhatt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/zutil.h b/include/linux/zutil.h index 6adfa9a..6636895 100644 --- a/include/linux/zutil.h +++ b/include/linux/zutil.h @@ -68,10 +68,10 @@ typedef uLong (*check_func) (uLong check, const Byte *buf, An Adler-32 checksum is almost as reliable as a CRC32 but can be computed much faster. Usage example: - uLong adler = adler32(0L, NULL, 0); + uLong adler = zlib_adler32(0L, NULL, 0); while (read_buffer(buffer, length) != EOF) { - adler = adler32(adler, buffer, length); + adler = zlib_adler32(adler, buffer, length); } if (adler != original_adler) error(); */ -- cgit v0.10.2 From 5f2a2d5d423d5337a1392fa016ec23a8a4206006 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 6 Nov 2015 16:33:04 -0800 Subject: ipc,msg: drop dst nil validation in copy_msg d0edd8528362 ("ipc: convert invalid scenarios to use WARN_ON") relaxed the nil dst parameter check, originally being a full BUG_ON. However, this check seems quite unnecessary when the only purpose is for ceckpoint/restore (MSG_COPY flag): o The copy variable is set initially to nil, apparently as a way of ensuring that prepare_copy is previously called. Which is in fact done, unconditionally at the beginning of do_msgrcv. o There is no concurrency with 'copy' (stack allocated in do_msgrcv). Furthermore, any errors in 'copy' (and thus prepare_copy/copy_msg) should always handled by IS_ERR() family. Therefore remove this check altogether as it can never occur with the current users. Signed-off-by: Davidlohr Bueso Cc: Stanislav Kinsbursky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 71f448e..ed81aaf 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -123,7 +123,6 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) size_t len = src->m_ts; size_t alen; - WARN_ON(dst == NULL); if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); -- cgit v0.10.2