From e6fd1fb3b5b58fcfa8e546c69d9cb64aa2c5c9b8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 15 Mar 2016 14:52:46 -0700 Subject: init/main.c: use list_for_each_entry() Use list_for_each_entry() instead of list_for_each() to simplify the code. Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/main.c b/init/main.c index 7c27de4..031a608 100644 --- a/init/main.c +++ b/init/main.c @@ -719,7 +719,6 @@ static int __init initcall_blacklist(char *str) static bool __init_or_module initcall_blacklisted(initcall_t fn) { - struct list_head *tmp; struct blacklist_entry *entry; char *fn_name; @@ -727,8 +726,7 @@ static bool __init_or_module initcall_blacklisted(initcall_t fn) if (!fn_name) return false; - list_for_each(tmp, &blacklisted_initcalls) { - entry = list_entry(tmp, struct blacklist_entry, next); + list_for_each_entry(entry, &blacklisted_initcalls, next) { if (!strcmp(fn_name, entry->buf)) { pr_debug("initcall %s blacklisted\n", fn_name); kfree(fn_name); -- cgit v0.10.2 From 25528213fe9f75f4e286f08d35a73ca2bb634a50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Mar 2016 14:52:49 -0700 Subject: tags: Fix DEFINE_PER_CPU expansions $ make tags GEN tags ctags: Warning: drivers/acpi/processor_idle.c:64: null expansion of name pattern "\1" ctags: Warning: drivers/xen/events/events_2l.c:41: null expansion of name pattern "\1" ctags: Warning: kernel/locking/lockdep.c:151: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:133: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:135: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:323: null expansion of name pattern "\1" ctags: Warning: net/ipv4/syncookies.c:53: null expansion of name pattern "\1" ctags: Warning: net/ipv6/syncookies.c:44: null expansion of name pattern "\1" ctags: Warning: net/rds/page.c:45: null expansion of name pattern "\1" Which are all the result of the DEFINE_PER_CPU pattern: scripts/tags.sh:200: '/\ Acked-by: David S. Miller Acked-by: Rafael J. Wysocki Cc: Tejun Heo Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 175c86b..9ca2b2f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -61,8 +61,8 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], - acpi_cstate); +static +DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate); static int disabled_by_idle_boot_param(void) { diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 7dd4631..403fe39 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -38,8 +38,9 @@ /* Find the first set bit in a evtchn mask */ #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) -static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); +#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask); static unsigned evtchn_2l_max_channels(void) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f894a2c..53ab2f8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d0..4d5cc6a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7..16e13d8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c..2d5589b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -50,8 +50,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv4_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2906ef2..aae3e5c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,8 +41,7 @@ static __u16 const msstab[] = { 9000 - 60, }; -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv6_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/rds/page.c b/net/rds/page.c index 5a14e6d..616f21f 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -42,8 +42,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, - rds_page_remainders); +static +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. -- cgit v0.10.2 From 3701dc815117e0317f6fcf062ff8a28cd171ce44 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 15 Mar 2016 14:52:52 -0700 Subject: m32r: mm: fix build warning While building we are getting warnings: arch/m32r/mm/init.c:63:17: warning: unused variable 'low' arch/m32r/mm/init.c:62:17: warning: unused variable 'max_dma' max_dma and low are only used if CONFIG_MMU is defined. Lets declare the variables inside the #ifdef. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 0d4146f..11fa717 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -59,21 +59,24 @@ void free_initrd_mem(unsigned long, unsigned long); void __init zone_sizes_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; - unsigned long max_dma; - unsigned long low; unsigned long start_pfn; #ifdef CONFIG_MMU - start_pfn = START_PFN(0); - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = MAX_LOW_PFN(0); - - if (low < max_dma){ - zones_size[ZONE_DMA] = low - start_pfn; - zones_size[ZONE_NORMAL] = 0; - } else { - zones_size[ZONE_DMA] = low - start_pfn; - zones_size[ZONE_NORMAL] = low - max_dma; + { + unsigned long low; + unsigned long max_dma; + + start_pfn = START_PFN(0); + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + low = MAX_LOW_PFN(0); + + if (low < max_dma) { + zones_size[ZONE_DMA] = low - start_pfn; + zones_size[ZONE_NORMAL] = 0; + } else { + zones_size[ZONE_DMA] = low - start_pfn; + zones_size[ZONE_NORMAL] = low - max_dma; + } } #else zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; -- cgit v0.10.2 From e928f35061a6f3ead993b230b01d9ca6b4f495ca Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 15 Mar 2016 14:52:55 -0700 Subject: blackfin: define dummy pgprot_writecombine for !MMU blackfin allmodconfig build fails with the error: ../sound/core/pcm_native.c: In function 'snd_pcm_lib_default_mmap': ../sound/core/pcm_native.c:3386:24: error: implicit declaration of function 'pgprot_writecombine' [-Werror=implicit-function-declaration] area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); ^ ../sound/core/pcm_native.c:3386:22: error: incompatible types when assigning to type 'pgprot_t {aka struct }' from type 'int' area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); ^ When !MMU, asm-generic will not define default pgprot_writecombine, so blackfin needs to define it by itself. The patch idea is from commit 65b9ab888cd7 ("arch/c6x/include/asm/pgtable.h: define dummy pgprot_writecombine for !MMU") Signed-off-by: Sudip Mukherjee Cc: Steven Miao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/blackfin/include/asm/pgtable.h b/arch/blackfin/include/asm/pgtable.h index b88a155..c1ee3d6 100644 --- a/arch/blackfin/include/asm/pgtable.h +++ b/arch/blackfin/include/asm/pgtable.h @@ -97,6 +97,8 @@ extern unsigned long get_fb_unmapped_area(struct file *filp, unsigned long, unsigned long); #define HAVE_ARCH_FB_UNMAPPED_AREA +#define pgprot_writecombine pgprot_noncached + #include #endif /* _BLACKFIN_PGTABLE_H */ -- cgit v0.10.2 From 4d548f61d61fcaa5b5e14b8d250de9604bdb61bf Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Tue, 15 Mar 2016 14:52:58 -0700 Subject: ocfs2/cluster: replace the interrupt safe spinlocks with common ones There actually no hardware or software interrupts in the context which using o2hb_live_lock, so we don't need to worry about race conditions caused by irq/softirq with spinlock held. Turning off irq is not good for system performance after all. Just replace them with a non interrupt safe function. Signed-off-by: Yiwen Jiang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a76b9ea..ef6a2ec 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt { static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work) jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long flags; - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", -- cgit v0.10.2 From bfd97a0320d338b2fce422adeddd512466ef2390 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Tue, 15 Mar 2016 14:53:01 -0700 Subject: ocfs2: use spinlock_irqsave() to downconvert lock in ocfs2_osb_dump() Commit a75e9ccabd92 ("ocfs2: use spinlock irqsave for downconvert lock") missed an unmodified place in ocfs2_osb_dump(), so it still exists a deadlock scenario. ocfs2_wake_downconvert_thread ocfs2_rw_unlock ocfs2_dio_end_io dio_complete ..... bio_endio req_bio_endio .... scsi_io_completion blk_done_softirq __do_softirq do_softirq irq_exit do_IRQ ocfs2_osb_dump cat /sys/kernel/debug/ocfs2/${uuid}/fs_state This patch still uses spin_lock_irqsave() - replace spin_lock() to solve this situation. Signed-off-by: Yiwen Jiang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index faa1365..302854e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -236,6 +236,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; + unsigned long flags; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -271,14 +272,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) cconn->cc_version.pv_minor); } - spin_lock(&osb->dc_task_lock); + spin_lock_irqsave(&osb->dc_task_lock, flags); out += snprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); - spin_unlock(&osb->dc_task_lock); + spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", -- cgit v0.10.2 From 39b29af03049421139656818f45b8ca7e12db3fa Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 15 Mar 2016 14:53:05 -0700 Subject: ocfs2/dlm: fix a typo in dlmcommon.h Refer to cluster/tcp.h, NET_MAX_PAYLOAD_BYTES is a typo for O2NET_MAX_PAYLOAD_BYTES. Since currently DLM_MIG_LOCKRES_RESERVED is not actually used, it won't cause any problem. But we'd better correct it for further use. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 68c607e..62d9323 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -545,7 +545,7 @@ struct dlm_master_requery * }; * * from ../cluster/tcp.h - * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) * (roughly 4080 bytes) * and sizeof(dlm_migratable_lockres) = 112 bytes * and sizeof(dlm_migratable_lock) = 16 bytes @@ -586,7 +586,7 @@ struct dlm_migratable_lockres /* from above, 128 bytes * for some undetermined future use */ -#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ +#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \ DLM_MIG_LOCKRES_MAX_LEN) struct dlm_create_lock -- cgit v0.10.2 From 60d663cb527340c87c6cb98842e90a43243e1607 Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Tue, 15 Mar 2016 14:53:08 -0700 Subject: ocfs2/dlm: add DEREF_DONE message This series of patches is to fix the dis-order issue of setting/clearing refmap bit described below. Node 1 Node 2(master) dlmlock dlm_do_master_request dlm_master_request_handler -> dlm_lockres_set_refmap_bit dlmlock succeed dlmunlock succeed dlm_purge_lockres dlm_deref_handler -> find lock resource is in DLM_LOCK_RES_SETREF_INPROG state, so dispatch a deref work dlm_purge_lockres succeed. call dlmlock again dlm_do_master_request dlm_master_request_handler -> dlm_lockres_set_refmap_bit deref work trigger, call dlm_lockres_clear_refmap_bit to clear Node 1 from refmap dlm_purge_lockres succeed dlm_send_remote_lock_request return DLM_IVLOCKID because the lockres is not exist BUG if the lockres is $RECOVERY This series of patches add a new message to keep the order of set and clear. Other nodes can purge the lock resource only after the refmap bit on master is cleared. This patch is to add DEREF_DONE message and corresponding handler. Node can purge the lock resource after receiving this message. As a new message is added, so increase the minor number of dlm protocol version. Signed-off-by: xuejiufei Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Reviewed-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 62d9323..c9f97da 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -451,6 +451,7 @@ enum { DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, DLM_BEGIN_EXIT_DOMAIN_MSG = 521, + DLM_DEREF_LOCKRES_DONE = 522, }; struct dlm_reco_node_data @@ -782,6 +783,15 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +struct dlm_deref_lockres_done { + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -968,6 +978,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, void dlm_assert_master_post_handler(int status, void *data, void *ret_data); int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2ee7fe7..c73c68e 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * - Message DLM_QUERY_NODEINFO added to allow online node removes * New in version 1.2: * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + * New in version 1.3: + * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the + * refmap is cleared */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 3, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key, + sizeof(struct dlm_deref_lockres_done), + dlm_deref_lockres_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9477d6e..8913e7d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2375,6 +2375,122 @@ done: return ret; } +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres_done *deref + = (struct dlm_deref_lockres_done *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + dlm_lockres_put(res); + + spin_unlock(&dlm->spinlock); + +done: + dlm_put(dlm); + return ret; +} + +static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 node) +{ + struct dlm_deref_lockres_done deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, + &deref, sizeof(deref), node, &r); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " + " to node %u\n", dlm->name, namelen, + lockname, ret, node); + } else if (r < 0) { + /* ignore the error */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, node, r); + dlm_print_one_lock_resource(res); + } +} + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) { struct dlm_ctxt *dlm; -- cgit v0.10.2 From 842b90b62461d0848bd56ad776117d15a5fa95c0 Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Tue, 15 Mar 2016 14:53:11 -0700 Subject: ocfs2/dlm: return in progress if master can not clear the refmap bit right now Master returns in-progress to non-master node when it can not clear the refmap bit right now. And non-master node will not purge the lock resource until receiving deref done message. Signed-off-by: xuejiufei Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Reviewed-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index c9f97da..3b77862 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -783,6 +783,11 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +enum { + DLM_DEREF_RESPONSE_DONE = 0, + DLM_DEREF_RESPONSE_INPROG = 1, +}; + struct dlm_deref_lockres_done { u32 pad1; u16 pad2; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 8913e7d..87e2254 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) dlm_print_one_lock_resource(res); BUG(); } - return ret; + return ret ? ret : r; } int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, res->lockname.len, res->lockname.name, node); dlm_print_one_lock_resource(res); } - ret = 0; + ret = DLM_DEREF_RESPONSE_DONE; goto done; } @@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->work_lock); queue_work(dlm->dlm_worker, &dlm->dispatched_work); - return 0; + return DLM_DEREF_RESPONSE_INPROG; done: if (res) @@ -2511,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } spin_unlock(&res->spinlock); + dlm_drop_lockres_ref_done(dlm, res, node); + if (cleared) { mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", dlm->name, res->lockname.len, res->lockname.name, node); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c5f6c24..22e6eb8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -202,6 +202,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } + if (!master && ret != 0) { + mlog(0, "%s: deref %.*s in progress or master goes down\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); -- cgit v0.10.2 From 8c0343968163dc3536e56268026f475b1bbf61b4 Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Tue, 15 Mar 2016 14:53:14 -0700 Subject: ocfs2/dlm: clear DROPPING_REF flag when the master goes down If the master goes down after return in-progress for deref message. The lock resource on non-master node can not be purged. Clear the DROPPING_REF flag and recovery it. Signed-off-by: xuejiufei Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Reviewed-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b94a425..1eee96e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2377,14 +2377,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "%s: res %.*s, Skip " - "recovery as it is being freed\n", - dlm->name, res->lockname.len, - res->lockname.name); - } else - dlm_move_lockres_to_recovery_list(dlm, - res); - + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + } + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, + res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); -- cgit v0.10.2 From d277f33eda000ca03b1497fcf1c9e2ec33adf4c6 Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Tue, 15 Mar 2016 14:53:17 -0700 Subject: ocfs2/dlm: return EINVAL when the lockres on migration target is in DROPPING_REF state If master migrate this lock resource to node when it happened to purge it, a new lock resource will be created and inserted into hash list. If then master goes down, the lock resource being purged is recovered, so there exist two lock resource with different owner. So return error to master if the lock resource is in DROPPING state, master will retry to migrate this lock resource. Signed-off-by: xuejiufei Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Reviewed-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 1eee96e..213279d 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * and RECOVERY flag changed when it completes. */ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len, hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(0, "%s: node is attempting to migrate " + "lockres %.*s, but marked as dropping " + " ref!\n", dlm->name, + mres->lockname_len, mres->lockname); + ret = -EINVAL; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + goto leave; + } + if (mres->flags & DLM_MRES_RECOVERY) { res->state |= DLM_LOCK_RES_RECOVERING; } else { -- cgit v0.10.2 From 814ce69432bffdd0533fda28deea5dcfba153d17 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 15 Mar 2016 14:53:20 -0700 Subject: ocfs2: fix a tiny race that leads file system read-only when o2hb detect a node down, it first set the dead node to recovery map and create ocfs2rec which will replay journal for dead node. o2hb thread then call dlm_do_local_recovery_cleanup() to delete the lock for dead node. After the lock of dead node is gone, locks for other nodes can be granted and may modify the meta data without replaying journal of the dead node. The detail is described as follows. N1 N2 N3(master) modify the extent tree of inode, and commit dirty metadata to journal, then goes down. o2hb thread detects N1 goes down, set recovery map and delete the lock of N1. dlm_thread flush ast for the lock of N2. do not detect the death of N1, so recovery map is empty. read inode from disk without replaying the journal of N1 and modify the extent tree of the inode that N1 had modified. ocfs2rec recover the journal of N1. The modification of N2 is lost. The modification of N1 and N2 are not serial, and it will lead to read-only file system. We can set recovery_waiting flag to the lock resource after delete the lock for dead node to prevent other node from getting the lock before dlm recovery. After dlm recovery, the recovery map on N2 is not empty, ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery. Signed-off-by: Jiufei Xue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 3b77862..004f2cb 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DROPPING_REF 0x00000040 #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -804,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) status = DLM_RECOVERING; else if (res->state & DLM_LOCK_RES_MIGRATING) status = DLM_MIGRATING; @@ -1026,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) { __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING| DLM_LOCK_RES_MIGRATING)); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 87e2254..9aed6e2 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2550,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; /* delay migration when the lockres is in RECOCERING state */ - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; if (res->owner != dlm->node_num) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 213279d..cd38488 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2175,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2312,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } + res->state |= DLM_LOCK_RES_RECOVERY_WAITING; dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 22e6eb8..68d239b 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; /* Another node has this resource with this node as the master */ @@ -707,7 +708,8 @@ static int dlm_thread(void *data) * dirty for a short while. */ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_RECOVERING)) { + DLM_LOCK_RES_RECOVERING | + DLM_LOCK_RES_RECOVERY_WAITING)) { /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); -- cgit v0.10.2 From 8d67d3c2444fd3bb787cc6c27053717c01155966 Mon Sep 17 00:00:00 2001 From: Jun Piao Date: Tue, 15 Mar 2016 14:53:23 -0700 Subject: ocfs2/dlm: fix a variable overflow problem in dlmdomain.c In dlm_send_join_cancels(), node is defined with type unsigned int, but initialized with -1, this will lead variable overflow. Although this won't cause any runtime problem, the code looks a little uncoordinated. Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c73c68e..12e064b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1399,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm, unsigned int map_size) { int status, tmpstat; - unsigned int node; + int node; if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))) { -- cgit v0.10.2 From 0d9c51a6e1df4651379b8fca8a7a96595cd9988a Mon Sep 17 00:00:00 2001 From: San Mehat Date: Tue, 15 Mar 2016 14:53:26 -0700 Subject: block: partition: add partition specific uevent callbacks for partition info This patch has been carried in the Android tree for quite some time and is one of the few patches required to get a mainline kernel up and running with an exsiting Android userspace. So I wanted to submit it for review and consideration if it should be merged. For partitions, add new uevent parameters 'PARTN' which specifies the partitions index in the table, and 'PARTNAME', which specifies PARTNAME specifices the partition name of a partition device. Android's userspace uses this for creating device node links from the partition name and number, ie: /dev/block/platform/soc/by-name/system or /dev/block/platform/soc/by-num/p1 One can see its usage here: https://android.googlesource.com/platform/system/core/+/master/init/devices.cpp#355 and https://android.googlesource.com/platform/system/core/+/master/init/devices.cpp#494 [john.stultz@linaro.org: dropped NPARTS and reworded commit message for context] Signed-off-by: Dima Zavin Signed-off-by: John Stultz Cc: Jens Axboe Cc: Rom Lemarchand Cc: Android Kernel Team Cc: Jeff Moyer Cc: Cc: Kees Cook Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/partition-generic.c b/block/partition-generic.c index fefd01b..5d87019 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -217,10 +217,21 @@ static void part_release(struct device *dev) kfree(p); } +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, + .uevent = part_uevent, }; static void delete_partition_rcu_cb(struct rcu_head *head) -- cgit v0.10.2 From dec63a4dec2d6d01346fd5d96062e67c0636852b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Mar 2016 14:53:29 -0700 Subject: paride: make 'verbose' parameter an 'int' again gcc-6.0 found an ancient bug in the paride driver, which had a "module_param(verbose, bool, 0);" since before 2.6.12, but actually uses it to accept '0', '1' or '2' as arguments: drivers/block/paride/pd.c: In function 'pd_init_dev_parms': drivers/block/paride/pd.c:298:29: warning: comparison of constant '1' with boolean expression is always false [-Wbool-compare] #define DBMSG(msg) ((verbose>1)?(msg):NULL) In 2012, Rusty did a cleanup patch that also changed the type of the variable to 'bool', which introduced what is now a gcc warning. This changes the type back to 'int' and adapts the module_param() line instead, so it should work as documented in case anyone ever cares about running the ancient driver with debugging. Fixes: 90ab5ee94171 ("module_param: make bool parameters really bool (drivers & misc)") Signed-off-by: Arnd Bergmann Rusty Russell Cc: Tim Waugh Cc: Sudip Mukherjee Cc: Jens Axboe Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 562b5a4..78a39f7 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -126,7 +126,7 @@ */ #include -static bool verbose = 0; +static int verbose = 0; static int major = PD_MAJOR; static char *name = PD_NAME; static int cluster = 64; @@ -161,7 +161,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV}; static DEFINE_MUTEX(pd_mutex); static DEFINE_SPINLOCK(pd_lock); -module_param(verbose, bool, 0); +module_param(verbose, int, 0); module_param(major, int, 0); module_param(name, charp, 0); module_param(cluster, int, 0); diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 1740d75..216a94f 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -117,7 +117,7 @@ */ -static bool verbose = 0; +static int verbose = 0; static int major = PT_MAJOR; static char *name = PT_NAME; static int disable = 0; @@ -152,7 +152,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; #include -module_param(verbose, bool, 0); +module_param(verbose, int, 0); module_param(major, int, 0); module_param(name, charp, 0); module_param_array(drive0, int, NULL, 0); -- cgit v0.10.2 From 376bf125ac781d32e202760ed7deb1ae4ed35d31 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:32 -0700 Subject: slub: clean up code for kmem cgroup support to kmem_cache_free_bulk This change is primarily an attempt to make it easier to realize the optimizations the compiler performs in-case CONFIG_MEMCG_KMEM is not enabled. Performance wise, even when CONFIG_MEMCG_KMEM is compiled in, the overhead is zero. This is because, as long as no process have enabled kmem cgroups accounting, the assignment is replaced by asm-NOP operations. This is possible because memcg_kmem_enabled() uses a static_key_false() construct. It also helps readability as it avoid accessing the p[] array like: p[size - 1] which "expose" that the array is processed backwards inside helper function build_detached_freelist(). Lastly this also makes the code more robust, in error case like passing NULL pointers in the array. Which were previously handled before commit 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk"). Fixes: 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk") Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index d8fbd4a..2a722e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2815,6 +2815,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2829,8 +2830,9 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. */ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; @@ -2846,8 +2848,11 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) return 0; + /* Support for memcg, compiler can optimize this out */ + df->s = cache_from_obj(s, object); + /* Start new detached freelist */ - set_freepointer(s, object, NULL); + set_freepointer(df->s, object, NULL); df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; @@ -2862,7 +2867,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2881,25 +2886,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); -- cgit v0.10.2 From 11c7aec2a9b4e685bbf6a15148e7841b3525fc0c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:35 -0700 Subject: mm/slab: move SLUB alloc hooks to common mm/slab.h First step towards sharing alloc_hook's between SLUB and SLAB allocators. Move the SLUB allocators *_alloc_hook to the common mm/slab.h for internal slab definitions. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.h b/mm/slab.h index 2eedace..fd231c9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,10 @@ struct kmem_cache { #endif #include +#include +#include +#include +#include /* * State of the slab allocator. @@ -321,6 +325,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s->object_size, flags, s->flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } + memcg_kmem_put_cache(s); +} + #ifndef CONFIG_SLOB /* * The slab lists for all objects. diff --git a/mm/slub.c b/mm/slub.c index 2a722e1..6dd04c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline int order_objects(int order, unsigned long size, int reserved) { return ((PAGE_SIZE << order) - reserved) / size; @@ -1281,36 +1257,6 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(gfpflags_allow_blocking(flags)); - - if (should_failslab(s->object_size, flags, s->flags)) - return NULL; - - return memcg_kmem_get_cache(s, flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - size_t i; - - flags &= gfp_allowed_mask; - for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, - s->flags, flags); - kasan_slab_alloc(s, object); - } - memcg_kmem_put_cache(s); -} - static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -- cgit v0.10.2 From fab9963a69dbd71304357dbfe4ec5345f14cebdd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:38 -0700 Subject: mm: fault-inject take over bootstrap kmem_cache check Remove the SLAB specific function slab_should_failslab(), by moving the check against fault-injection for the bootstrap slab, into the shared function should_failslab() (used by both SLAB and SLUB). This is a step towards sharing alloc_hook's between SLUB and SLAB. This bootstrap slab "kmem_cache" is used for allocating struct kmem_cache objects to the allocator itself. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 3159a7d..9f4956d 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -62,10 +62,9 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name, #endif /* CONFIG_FAULT_INJECTION */ #ifdef CONFIG_FAILSLAB -extern bool should_failslab(size_t size, gfp_t gfpflags, unsigned long flags); +extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags); #else -static inline bool should_failslab(size_t size, gfp_t gfpflags, - unsigned long flags) +static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { return false; } diff --git a/mm/failslab.c b/mm/failslab.c index 79171b4..b0fac98 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,5 +1,7 @@ #include #include +#include +#include "slab.h" static struct { struct fault_attr attr; @@ -11,18 +13,22 @@ static struct { .cache_filter = false, }; -bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) +bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + /* No fault-injection for bootstrap cache */ + if (unlikely(s == kmem_cache)) + return false; + if (gfpflags & __GFP_NOFAIL) return false; if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; - if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; - return should_fail(&failslab.attr, size); + return should_fail(&failslab.attr, s->object_size); } static int __init setup_failslab(char *str) diff --git a/mm/slab.c b/mm/slab.c index 621fbcb..95f344d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2926,14 +2926,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; @@ -3155,7 +3147,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, lockdep_trace_alloc(flags); - if (slab_should_failslab(cachep, flags)) + if (should_failslab(cachep, flags)) return NULL; cachep = memcg_kmem_get_cache(cachep, flags); @@ -3243,7 +3235,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) lockdep_trace_alloc(flags); - if (slab_should_failslab(cachep, flags)) + if (should_failslab(cachep, flags)) return NULL; cachep = memcg_kmem_get_cache(cachep, flags); diff --git a/mm/slab.h b/mm/slab.h index fd231c9..6c7f16a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -360,7 +360,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, lockdep_trace_alloc(flags); might_sleep_if(gfpflags_allow_blocking(flags)); - if (should_failslab(s->object_size, flags, s->flags)) + if (should_failslab(s, flags)) return NULL; return memcg_kmem_get_cache(s, flags); -- cgit v0.10.2 From 011eceaf0ad54e0916df8593ffc27b9f73d89fcf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:41 -0700 Subject: slab: use slab_pre_alloc_hook in SLAB allocator shared with SLUB Deduplicate code in SLAB allocator functions slab_alloc() and slab_alloc_node() by using the slab_pre_alloc_hook() call, which is now shared between SLUB and SLAB. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 95f344d..1857a65 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3144,14 +3144,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3232,14 +3228,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); -- cgit v0.10.2 From 0142eae3ae15b9b9f0ae2a8e68e3c8dc347a2394 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:44 -0700 Subject: mm: kmemcheck skip object if slab allocation failed In the SLAB allocator kmemcheck_slab_alloc() is guarded against being called in case the object is NULL. In SLUB allocator this NULL pointer invocation can happen, which seems like an oversight. Move the NULL pointer check into kmemcheck code (kmemcheck_slab_alloc) so the check gets moved out of the fastpath, when not compiled with CONFIG_KMEMCHECK. This is a step towards sharing post_alloc_hook between SLUB and SLAB, because slab_post_alloc_hook() does not perform this check before calling kmemcheck_slab_alloc(). Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb..6f4f424 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order) void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size) { + if (unlikely(!object)) /* Skip object if allocation failed */ + return; + /* * Has already been memset(), which initializes the shadow for us * as well. -- cgit v0.10.2 From d5e3ed66d6f260b3bb68cb5cf0fe79777e8febf0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:47 -0700 Subject: slab: use slab_post_alloc_hook in SLAB allocator shared with SLUB Reviewers notice that the order in slab_post_alloc_hook() of kmemcheck_slab_alloc() and kmemleak_alloc_recursive() gets swapped compared to slab.c / SLAB allocator. Also notice memset now occurs before calling kmemcheck_slab_alloc() and kmemleak_alloc_recursive(). I assume this reordering of kmemcheck, kmemleak and memset is okay because this is the order they are used by the SLUB allocator. This patch completes the sharing of alloc_hook's between SLUB and SLAB. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 1857a65..f872208 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3176,16 +3176,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3237,17 +3232,12 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } -- cgit v0.10.2 From 2a777eac173a53b33f2f7dbed2b61a1f5eb0b531 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:50 -0700 Subject: slab: implement bulk alloc in SLAB allocator This patch implements the alloc side of bulk API for the SLAB allocator. Further optimization are still possible by changing the call to __do_cache_alloc() into something that can return multiple objects. This optimization is left for later, given end results already show in the area of 80% speedup. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index f872208..0c2e2ca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3392,9 +3392,42 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + /* this call could be done outside IRQ disabled section */ + objp = cache_alloc_debugcheck_after(s, flags, objp, _RET_IP_); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v0.10.2 From 7b0501dd6b1862a83c5c1c65beadce99014b97a4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:53 -0700 Subject: slab: avoid running debug SLAB code with IRQs disabled for alloc_bulk Move the call to cache_alloc_debugcheck_after() outside the IRQ disabled section in kmem_cache_alloc_bulk(). When CONFIG_DEBUG_SLAB is disabled the compiler should remove this code. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 0c2e2ca..207fa92 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3391,6 +3391,16 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) +{ + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); +} + int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { @@ -3406,15 +3416,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, for (i = 0; i < size; i++) { void *objp = __do_cache_alloc(s, flags); - /* this call could be done outside IRQ disabled section */ - objp = cache_alloc_debugcheck_after(s, flags, objp, _RET_IP_); - if (unlikely(!objp)) goto error; p[i] = objp; } local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + /* Clear memory outside IRQ disabled section */ if (unlikely(flags & __GFP_ZERO)) for (i = 0; i < size; i++) @@ -3425,6 +3434,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, return size; error: local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); slab_post_alloc_hook(s, flags, i, p); __kmem_cache_free_bulk(s, i, p); return 0; -- cgit v0.10.2 From e6cdb58d1c8307f66039b04203e378d89938281c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:56 -0700 Subject: slab: implement bulk free in SLAB allocator This patch implements the free side of bulk API for the SLAB allocator kmem_cache_free_bulk(), and concludes the implementation of optimized bulk API for SLAB allocator. Benchmarked[1] cost of alloc+free (obj size 256 bytes) on CPU i7-4790K @ 4.00GHz, with no debug options, no PREEMPT and CONFIG_MEMCG_KMEM=y but no active user of kmemcg. SLAB single alloc+free cost: 87 cycles(tsc) 21.814 ns with this optimized config. bulk- Current fallback - optimized SLAB bulk 1 - 102 cycles(tsc) 25.747 ns - 41 cycles(tsc) 10.490 ns - improved 59.8% 2 - 94 cycles(tsc) 23.546 ns - 26 cycles(tsc) 6.567 ns - improved 72.3% 3 - 92 cycles(tsc) 23.127 ns - 20 cycles(tsc) 5.244 ns - improved 78.3% 4 - 90 cycles(tsc) 22.663 ns - 18 cycles(tsc) 4.588 ns - improved 80.0% 8 - 88 cycles(tsc) 22.242 ns - 14 cycles(tsc) 3.656 ns - improved 84.1% 16 - 88 cycles(tsc) 22.010 ns - 13 cycles(tsc) 3.480 ns - improved 85.2% 30 - 89 cycles(tsc) 22.305 ns - 13 cycles(tsc) 3.303 ns - improved 85.4% 32 - 89 cycles(tsc) 22.277 ns - 13 cycles(tsc) 3.309 ns - improved 85.4% 34 - 88 cycles(tsc) 22.246 ns - 13 cycles(tsc) 3.294 ns - improved 85.2% 48 - 88 cycles(tsc) 22.121 ns - 13 cycles(tsc) 3.492 ns - improved 85.2% 64 - 88 cycles(tsc) 22.052 ns - 13 cycles(tsc) 3.411 ns - improved 85.2% 128 - 89 cycles(tsc) 22.452 ns - 15 cycles(tsc) 3.841 ns - improved 83.1% 158 - 89 cycles(tsc) 22.403 ns - 14 cycles(tsc) 3.746 ns - improved 84.3% 250 - 91 cycles(tsc) 22.775 ns - 16 cycles(tsc) 4.111 ns - improved 82.4% Notice it is not recommended to do very large bulk operation with this bulk API, because local IRQs are disabled in this period. [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/slab_bulk_test01.c Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 207fa92..2e075c0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3385,12 +3385,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) -{ - __kmem_cache_free_bulk(s, size, p); -} -EXPORT_SYMBOL(kmem_cache_free_bulk); - static __always_inline void cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p, unsigned long caller) @@ -3584,6 +3578,29 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. -- cgit v0.10.2 From ca257195511d536308700548de008b51729221eb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:54:00 -0700 Subject: mm: new API kfree_bulk() for SLAB+SLUB allocators This patch introduce a new API call kfree_bulk() for bulk freeing memory objects not bound to a single kmem_cache. Christoph pointed out that it is possible to implement freeing of objects, without knowing the kmem_cache pointer as that information is available from the object's page->slab_cache. Proposing to remove the kmem_cache argument from the bulk free API. Jesper demonstrated that these extra steps per object comes at a performance cost. It is only in the case CONFIG_MEMCG_KMEM is compiled in and activated runtime that these steps are done anyhow. The extra cost is most visible for SLAB allocator, because the SLUB allocator does the page lookup (virt_to_head_page()) anyhow. Thus, the conclusion was to keep the kmem_cache free bulk API with a kmem_cache pointer, but we can still implement a kfree_bulk() API fairly easily. Simply by handling if kmem_cache_free_bulk() gets called with a kmem_cache NULL pointer. This does increase the code size a bit, but implementing a separate kfree_bulk() call would likely increase code size even more. Below benchmarks cost of alloc+free (obj size 256 bytes) on CPU i7-4790K @ 4.00GHz, no PREEMPT and CONFIG_MEMCG_KMEM=y. Code size increase for SLAB: add/remove: 0/0 grow/shrink: 1/0 up/down: 74/0 (74) function old new delta kmem_cache_free_bulk 660 734 +74 SLAB fastpath: 87 cycles(tsc) 21.814 sz - fallback - kmem_cache_free_bulk - kfree_bulk 1 - 103 cycles 25.878 ns - 41 cycles 10.498 ns - 81 cycles 20.312 ns 2 - 94 cycles 23.673 ns - 26 cycles 6.682 ns - 42 cycles 10.649 ns 3 - 92 cycles 23.181 ns - 21 cycles 5.325 ns - 39 cycles 9.950 ns 4 - 90 cycles 22.727 ns - 18 cycles 4.673 ns - 26 cycles 6.693 ns 8 - 89 cycles 22.270 ns - 14 cycles 3.664 ns - 23 cycles 5.835 ns 16 - 88 cycles 22.038 ns - 14 cycles 3.503 ns - 22 cycles 5.543 ns 30 - 89 cycles 22.284 ns - 13 cycles 3.310 ns - 20 cycles 5.197 ns 32 - 88 cycles 22.249 ns - 13 cycles 3.420 ns - 20 cycles 5.166 ns 34 - 88 cycles 22.224 ns - 14 cycles 3.643 ns - 20 cycles 5.170 ns 48 - 88 cycles 22.088 ns - 14 cycles 3.507 ns - 20 cycles 5.203 ns 64 - 88 cycles 22.063 ns - 13 cycles 3.428 ns - 20 cycles 5.152 ns 128 - 89 cycles 22.483 ns - 15 cycles 3.891 ns - 23 cycles 5.885 ns 158 - 89 cycles 22.381 ns - 15 cycles 3.779 ns - 22 cycles 5.548 ns 250 - 91 cycles 22.798 ns - 16 cycles 4.152 ns - 23 cycles 5.967 ns SLAB when enabling MEMCG_KMEM runtime: - kmemcg fastpath: 130 cycles(tsc) 32.684 ns (step:0) 1 - 148 cycles 37.220 ns - 66 cycles 16.622 ns - 66 cycles 16.583 ns 2 - 141 cycles 35.510 ns - 51 cycles 12.820 ns - 58 cycles 14.625 ns 3 - 140 cycles 35.017 ns - 37 cycles 9.326 ns - 33 cycles 8.474 ns 4 - 137 cycles 34.507 ns - 31 cycles 7.888 ns - 33 cycles 8.300 ns 8 - 140 cycles 35.069 ns - 25 cycles 6.461 ns - 25 cycles 6.436 ns 16 - 138 cycles 34.542 ns - 23 cycles 5.945 ns - 22 cycles 5.670 ns 30 - 136 cycles 34.227 ns - 22 cycles 5.502 ns - 22 cycles 5.587 ns 32 - 136 cycles 34.253 ns - 21 cycles 5.475 ns - 21 cycles 5.324 ns 34 - 136 cycles 34.254 ns - 21 cycles 5.448 ns - 20 cycles 5.194 ns 48 - 136 cycles 34.075 ns - 21 cycles 5.458 ns - 21 cycles 5.367 ns 64 - 135 cycles 33.994 ns - 21 cycles 5.350 ns - 21 cycles 5.259 ns 128 - 137 cycles 34.446 ns - 23 cycles 5.816 ns - 22 cycles 5.688 ns 158 - 137 cycles 34.379 ns - 22 cycles 5.727 ns - 22 cycles 5.602 ns 250 - 138 cycles 34.755 ns - 24 cycles 6.093 ns - 23 cycles 5.986 ns Code size increase for SLUB: function old new delta kmem_cache_free_bulk 717 799 +82 SLUB benchmark: SLUB fastpath: 46 cycles(tsc) 11.691 ns (step:0) sz - fallback - kmem_cache_free_bulk - kfree_bulk 1 - 61 cycles 15.486 ns - 53 cycles 13.364 ns - 57 cycles 14.464 ns 2 - 54 cycles 13.703 ns - 32 cycles 8.110 ns - 33 cycles 8.482 ns 3 - 53 cycles 13.272 ns - 25 cycles 6.362 ns - 27 cycles 6.947 ns 4 - 51 cycles 12.994 ns - 24 cycles 6.087 ns - 24 cycles 6.078 ns 8 - 50 cycles 12.576 ns - 21 cycles 5.354 ns - 22 cycles 5.513 ns 16 - 49 cycles 12.368 ns - 20 cycles 5.054 ns - 20 cycles 5.042 ns 30 - 49 cycles 12.273 ns - 18 cycles 4.748 ns - 19 cycles 4.758 ns 32 - 49 cycles 12.401 ns - 19 cycles 4.821 ns - 19 cycles 4.810 ns 34 - 98 cycles 24.519 ns - 24 cycles 6.154 ns - 24 cycles 6.157 ns 48 - 83 cycles 20.833 ns - 21 cycles 5.446 ns - 21 cycles 5.429 ns 64 - 75 cycles 18.891 ns - 20 cycles 5.247 ns - 20 cycles 5.238 ns 128 - 93 cycles 23.271 ns - 27 cycles 6.856 ns - 27 cycles 6.823 ns 158 - 102 cycles 25.581 ns - 30 cycles 7.714 ns - 30 cycles 7.695 ns 250 - 107 cycles 26.917 ns - 38 cycles 9.514 ns - 38 cycles 9.506 ns SLUB when enabling MEMCG_KMEM runtime: - kmemcg fastpath: 71 cycles(tsc) 17.897 ns (step:0) 1 - 85 cycles 21.484 ns - 78 cycles 19.569 ns - 75 cycles 18.938 ns 2 - 81 cycles 20.363 ns - 45 cycles 11.258 ns - 44 cycles 11.076 ns 3 - 78 cycles 19.709 ns - 33 cycles 8.354 ns - 32 cycles 8.044 ns 4 - 77 cycles 19.430 ns - 28 cycles 7.216 ns - 28 cycles 7.003 ns 8 - 101 cycles 25.288 ns - 23 cycles 5.849 ns - 23 cycles 5.787 ns 16 - 76 cycles 19.148 ns - 20 cycles 5.162 ns - 20 cycles 5.081 ns 30 - 76 cycles 19.067 ns - 19 cycles 4.868 ns - 19 cycles 4.821 ns 32 - 76 cycles 19.052 ns - 19 cycles 4.857 ns - 19 cycles 4.815 ns 34 - 121 cycles 30.291 ns - 25 cycles 6.333 ns - 25 cycles 6.268 ns 48 - 108 cycles 27.111 ns - 21 cycles 5.498 ns - 21 cycles 5.458 ns 64 - 100 cycles 25.164 ns - 20 cycles 5.242 ns - 20 cycles 5.229 ns 128 - 155 cycles 38.976 ns - 27 cycles 6.886 ns - 27 cycles 6.892 ns 158 - 132 cycles 33.034 ns - 30 cycles 7.711 ns - 30 cycles 7.728 ns 250 - 130 cycles 32.612 ns - 38 cycles 9.560 ns - 38 cycles 9.549 ns Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/slab.h b/include/linux/slab.h index 3627d5c..9d9a5bd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -323,6 +323,15 @@ void kmem_cache_free(struct kmem_cache *, void *); void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +/* + * Caller must not use kfree_bulk() on memory not originally allocated + * by kmalloc(), because the SLOB allocator cannot handle this. + */ +static __always_inline void kfree_bulk(size_t size, void **p) +{ + kmem_cache_free_bulk(NULL, size, p); +} + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; diff --git a/mm/slab.c b/mm/slab.c index 2e075c0..b3eca03 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3587,7 +3587,10 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) for (i = 0; i < size; i++) { void *objp = p[i]; - s = cache_from_obj(orig_s, objp); + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); debug_check_no_locks_freed(objp, s->object_size); if (!(s->flags & SLAB_DEBUG_OBJECTS)) diff --git a/mm/slab_common.c b/mm/slab_common.c index 065b7bd..6afb226 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { size_t i; - for (i = 0; i < nr; i++) - kmem_cache_free(s, p[i]); + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, diff --git a/mm/slub.c b/mm/slub.c index 6dd04c0..9620815 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2783,23 +2783,38 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, size_t first_skipped_index = 0; int lookahead = 3; void *object; + struct page *page; /* Always re-init detached_freelist */ df->page = NULL; do { object = p[--size]; + /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ } while (!object && size); if (!object) return 0; - /* Support for memcg, compiler can optimize this out */ - df->s = cache_from_obj(s, object); + page = virt_to_head_page(object); + if (!s) { + /* Handle kalloc'ed objects */ + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(object); + __free_kmem_pages(page, compound_order(page)); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Derive kmem_cache from object */ + df->s = page->slab_cache; + } else { + df->s = cache_from_obj(s, object); /* Support for memcg */ + } /* Start new detached freelist */ + df->page = page; set_freepointer(df->s, object, NULL); - df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; p[size] = NULL; /* mark object processed */ -- cgit v0.10.2 From 9f706d6820d3ea776d6b3fc0c1de9f81eb0d021b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:54:03 -0700 Subject: mm: fix some spelling Fix up trivial spelling errors, noticed while reading the code. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 792c898..30b02e7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -765,7 +765,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); /* - * helper for acessing a memcg's index. It will be used as an index in the + * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function * will return -1 when this is not a kmem-limited memcg. */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 9d9a5bd..5d49f0c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -314,7 +314,7 @@ void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment void kmem_cache_free(struct kmem_cache *, void *); /* - * Bulk allocation and freeing operations. These are accellerated in an + * Bulk allocation and freeing operations. These are accelerated in an * allocator specific way to avoid taking locks repeatedly or building * metadata structures unnecessarily. * diff --git a/mm/slab.h b/mm/slab.h index 6c7f16a..e880bbe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -172,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* * Generic implementation of bulk operations * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the objecct listed + * perform optimizations. In that case segments of the object listed * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -- cgit v0.10.2 From 12c61fe9b763812adac44522100527caf534462e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:06 -0700 Subject: mm/slab: fix stale code comment This patchset implements a new freed object management way, that is, OBJFREELIST_SLAB. Purpose of it is to reduce memory overhead in SLAB. SLAB needs a array to manage freed objects in a slab. If there is leftover after objects are packed into a slab, we can use it as a management array, and, in this case, there is no memory waste. But, in the other cases, we need to allocate extra memory for a management array or utilize dedicated internal memory in a slab for it. Both cases causes memory waste so it's not good. With this patchset, freed object itself can be used for a management array. So, memory waste could be reduced. Detailed idea and numbers are described in last patch's commit description. Please refer it. In fact, I tested another idea implementing OBJFREELIST_SLAB with extendable linked array through another freed object. It can remove memory waste completely but it causes more computational overhead in critical lock path and it seems that overhead outweigh benefit. So, this patchset doesn't include it. I will attach prototype just for a reference. This patch (of 16): We use freelist_idx_t type for free object management whose size would be smaller than size of unsigned int. Fix it. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index b3eca03..23fc4b0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -537,7 +537,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object + * - One freelist_idx_t for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * -- cgit v0.10.2 From 6fb924304ac35f1ab9f3abe73527efcd5156131f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:09 -0700 Subject: mm/slab: remove useless structure define It is obsolete so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 23fc4b0..3634dc1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -225,16 +225,6 @@ static inline void clear_obj_pfmemalloc(void **objp) } /* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - -/* * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (2 * MAX_NUMNODES) @@ -457,6 +447,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, -- cgit v0.10.2 From 260b61dd46ed07f517e4059ab9881402cbd6a385 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:12 -0700 Subject: mm/slab: remove the checks for slab implementation bug Some of "#if DEBUG" are for reporting slab implementation bug rather than user usecase bug. It's not really needed because slab is stable for a quite long time and it makes code too dirty. This patch remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 3634dc1..14c3f9c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2110,8 +2110,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2368,9 +2366,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked @@ -2528,30 +2523,23 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; -#if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); -#endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { @@ -2817,8 +2805,7 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } /* move slabp to correct slabp list: */ @@ -3101,7 +3088,7 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; /* move slabp to correct slabp list: */ list_del(&page->lru); @@ -3252,7 +3239,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3282,9 +3269,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; -#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); -- cgit v0.10.2 From a307ebd468e0b97c203f5a99a56a6017e4d1991a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:15 -0700 Subject: mm/slab: activate debug_pagealloc in SLAB when it is actually enabled Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 14c3f9c..4807cf4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1838,7 +1838,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && + if (debug_pagealloc_enabled() && + cachep->size % PAGE_SIZE == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1); @@ -2176,7 +2177,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && + if (debug_pagealloc_enabled() && + !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && ALIGN(size, cachep->align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); @@ -2232,7 +2234,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * poisoning, then it's going to smash the contents of * the redzone and userword anyhow, so switch them off. */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) + if (debug_pagealloc_enabled() && + size % PAGE_SIZE == 0 && flags & SLAB_POISON) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif } @@ -2716,7 +2719,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + if (debug_pagealloc_enabled() && + (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, caller); kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0); @@ -2861,7 +2865,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if (debug_pagealloc_enabled() && + (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1); else -- cgit v0.10.2 From 40323278b557a5909bbecfa181c91a3af7afbbe3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:18 -0700 Subject: mm/slab: use more appropriate condition check for debug_pagealloc debug_pagealloc debugging is related to SLAB_POISON flag rather than FORCED_DEBUG option, although FORCED_DEBUG option will enable SLAB_POISON. Fix it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 4807cf4..8bca9be 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2169,7 +2169,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) /* * To activate debug pagealloc, off-slab management is necessary * requirement. In early phase of initialization, small sized slab @@ -2177,7 +2176,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && ALIGN(size, cachep->align) < PAGE_SIZE) { @@ -2185,7 +2184,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size = PAGE_SIZE; } #endif -#endif /* * Determine if the slab management is 'on' or 'off' slab. -- cgit v0.10.2 From 40b44137971c2e5865a78f9f7de274449983ccb5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:21 -0700 Subject: mm/slab: clean up DEBUG_PAGEALLOC processing code Currently, open code for checking DEBUG_PAGEALLOC cache is spread to some sites. It makes code unreadable and hard to change. This patch cleans up this code. The following patch will change the criteria for DEBUG_PAGEALLOC cache so this clean-up will help it, too. [akpm@linux-foundation.org: fix build with CONFIG_DEBUG_PAGEALLOC=n] Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b6e227..69fd6bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2195,14 +2195,18 @@ kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ -#else +#endif /* CONFIG_HIBERNATION */ +#else /* CONFIG_DEBUG_PAGEALLOC */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } -#endif /* CONFIG_HIBERNATION */ -#endif +#endif /* CONFIG_HIBERNATION */ +static inline bool debug_pagealloc_enabled(void) +{ + return false; +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); diff --git a/mm/slab.c b/mm/slab.c index 8bca9be..3142ec3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1661,6 +1661,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1694,6 +1702,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1772,6 +1797,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1837,17 +1865,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2226,16 +2245,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ freelist_size = calculate_freelist_size(cachep->num, 0); - -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (debug_pagealloc_enabled() && - size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif } cachep->colour_off = cache_line_size(); @@ -2251,7 +2260,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* * This is a possibility for one of the kmalloc_{dma,}_caches. @@ -2475,9 +2496,6 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); #if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2501,10 +2519,11 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } #else if (cachep->ctor) cachep->ctor(objp); @@ -2716,18 +2735,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2862,16 +2871,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) -- cgit v0.10.2 From d31676dfde257cb2b3e52d4e657d8ad2251e4d49 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:24 -0700 Subject: mm/slab: alternative implementation for DEBUG_SLAB_LEAK DEBUG_SLAB_LEAK is a debug option. It's current implementation requires status buffer so we need more memory to use it. And, it cause kmem_cache initialization step more complex. To remove this extra memory usage and to simplify initialization step, this patch implement this feature with another way. When user requests to get slab object owner information, it marks that getting information is started. And then, all free objects in caches are flushed to corresponding slab page. Now, we can distinguish all freed object so we can know all allocated objects, too. After collecting slab object owner information on allocated objects, mark is checked that there is no free during the processing. If true, we can be sure that our information is correct so information is returned to user. Although this way is rather complex, it has two important benefits mentioned above. So, I think it is worth changing. There is one drawback that it takes more time to get slab object owner information but it is just a debug option so it doesn't matter at all. To help review, this patch implements new way only. Following patch will remove useless code. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index cf139d3..e878ba3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -60,6 +60,9 @@ struct kmem_cache { atomic_t allocmiss; atomic_t freehit; atomic_t freemiss; +#ifdef CONFIG_DEBUG_SLAB_LEAK + atomic_t store_user_clean; +#endif /* * If debugging is enabled, then the allocator can add additional diff --git a/mm/slab.c b/mm/slab.c index 3142ec3..907abe9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -396,20 +396,25 @@ static void set_obj_status(struct page *page, int idx, int val) status[idx] = val; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; + return atomic_read(&cachep->store_user_clean) == 1; +} - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; +static inline void set_store_user_clean(struct kmem_cache *cachep) +{ + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -2550,6 +2555,11 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; +#if DEBUG + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); +#endif + return objp; } @@ -2725,8 +2735,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); @@ -4119,15 +4131,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. + */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) + continue; + + if (!add_caller(n, v)) return; } } @@ -4163,21 +4194,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); + + x[1] = 0; - x[1] = 0; + for_each_kmem_cache_node(cachep, node, n) { - for_each_kmem_cache_node(cachep, node, n) { + check_irq_on(); + spin_lock_irq(&n->list_lock); - check_irq_on(); - spin_lock_irq(&n->list_lock); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ -- cgit v0.10.2 From 249247b6f8ee362189a2f2bf598a14ff6c95fb4c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:27 -0700 Subject: mm/slab: remove object status buffer for DEBUG_SLAB_LEAK Now, we don't use object status buffer in any setup. Remove it. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 907abe9..02be9d9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -380,22 +380,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) -{ - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; -} - static inline bool is_store_user_clean(struct kmem_cache *cachep) { return atomic_read(&cachep->store_user_clean) == 1; @@ -413,7 +399,6 @@ static inline void set_store_user_dirty(struct kmem_cache *cachep) } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -476,9 +461,6 @@ static size_t calculate_freelist_size(int nr_objs, size_t align) size_t freelist_size; freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - if (align) freelist_size = ALIGN(freelist_size, align); @@ -491,10 +473,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, int nr_objs; size_t remained_size; size_t freelist_size; - int extra_space = 0; - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at @@ -503,7 +482,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); + nr_objs = slab_size / (buffer_size + idx_size); /* * This calculated number will be either the right @@ -1961,16 +1940,13 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2533,7 +2509,6 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - set_obj_status(page, i, OBJECT_FREE); set_free_obj(page, i, i); } } @@ -2745,7 +2720,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); slab_kernel_map(cachep, objp, 0, caller); @@ -2878,8 +2852,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2904,8 +2876,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -- cgit v0.10.2 From 2e6b3602168797fd4d80d86d208c4ba8fcfa3b8b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:30 -0700 Subject: mm/slab: put the freelist at the end of slab page Currently, the freelist is at the front of slab page. This requires extra space to meet object alignment requirement. If we put the freelist at the end of a slab page, objects could start at page boundary and will be at correct alignment. This is possible because freelist has no alignment constraint itself. This gives us two benefits: It removes extra memory space for the freelist alignment and remove complex calculation at cache initialization step. I can't think notable drawback here. I mentioned that this would reduce extra memory space, but, this benefit is rather theoretical because it can be applied to very few cases. Following is the example cache type that can get benefit from this change. size align num before after 32 8 124 4100 4092 64 8 63 4103 4095 88 8 46 4102 4094 272 8 15 4103 4095 408 8 10 4098 4090 32 16 124 4108 4092 64 16 63 4111 4095 32 32 124 4124 4092 64 32 63 4127 4095 96 32 42 4106 4074 before means whole size for objects and aligned freelist before applying patch and after shows the result of this patch. Since before is more than 4096, number of object should decrease and memory waste happens. Anyway, this patch removes complex calculation so looks beneficial to me. [akpm@linux-foundation.org: fix kerneldoc] Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 02be9d9..b3d91b0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -456,55 +456,12 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) + unsigned long flags, size_t *left_over, unsigned int *num) { - int nr_objs; - size_t mgmt_size; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -512,9 +469,12 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One freelist_idx_t for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because @@ -522,16 +482,13 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * correct alignment when allocated. */ if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + *num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + *num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } #if DEBUG @@ -1911,7 +1868,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1921,7 +1877,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { unsigned long offslab_limit; size_t left_over = 0; @@ -1931,7 +1887,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + cache_estimate(gfporder, size, flags, &remainder, &num); if (!num) continue; @@ -2207,12 +2163,12 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); + left_over = calculate_slab_order(cachep, size, flags); if (!cachep->num) return -E2BIG; - freelist_size = calculate_freelist_size(cachep->num, cachep->align); + freelist_size = cachep->num * sizeof(freelist_idx_t); /* * If the slab has been placed off-slab, and we have enough space then @@ -2223,11 +2179,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) left_over -= freelist_size; } - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); - } - cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < cachep->align) @@ -2443,6 +2394,9 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); + page->s_mem = addr + colour_off; + page->active = 0; + if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, @@ -2450,11 +2404,11 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } -- cgit v0.10.2 From 832a15d209cd260180407bde1af18965b21623f3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:33 -0700 Subject: mm/slab: align cache size first before determination of OFF_SLAB candidate Finding suitable OFF_SLAB candidate is more related to aligned cache size rather than original size. Same reasoning can be applied to the debug pagealloc candidate. So, this patch moves up alignment fixup to proper position. From that point, size is aligned so we can remove some alignment fixups. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index b3d91b0..d5dffc8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2125,6 +2125,17 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } +#endif + + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + +#if DEBUG /* * To activate debug pagealloc, off-slab management is necessary * requirement. In early phase of initialization, small sized slab @@ -2135,8 +2146,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); + size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif @@ -2148,20 +2159,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) + !(flags & SLAB_NOLEAKTRACE)) { /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= CFLGS_OFF_SLAB; - - size = ALIGN(size, cachep->align); - /* - * We should restrict the number of objects in a slab to implement - * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. - */ - if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) - size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + } left_over = calculate_slab_order(cachep, size, flags); -- cgit v0.10.2 From 158e319bba59e890c3920ce6d827c188287bae84 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:35 -0700 Subject: mm/slab: clean up cache type determination Current cache type determination code is open-code and looks not understandable. Following patch will introduce one more cache type and it would make code more complex. So, before it happens, this patch abstracts these codes. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index d5dffc8..9b56685 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2023,6 +2023,64 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Determine if the slab management is 'on' or 'off' slab. + * (bootstrapping cannot cope with offslab caches so don't do + * it too early on. Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) + */ + if (size < OFF_SLAB_MIN_SIZE) + return false; + + if (slab_early_init) + return false; + + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2047,7 +2105,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2098,6 +2155,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2152,43 +2213,18 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) { - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ + if (set_off_slab_cache(cachep, size, flags)) { flags |= CFLGS_OFF_SLAB; + goto done; } - left_over = calculate_slab_order(cachep, size, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = cachep->num * sizeof(freelist_idx_t); + if (set_on_slab_cache(cachep, size, flags)) + goto done; - /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. - */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; - } + return -E2BIG; - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2209,7 +2245,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) #endif if (OFF_SLAB(cachep)) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); /* * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than -- cgit v0.10.2 From f3a3c320d54eea39a419fd539f5b0e9c74517b0a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:38 -0700 Subject: mm/slab: do not change cache size if debug pagealloc isn't possible We can fail to setup off slab in some conditions. Even in this case, debug pagealloc increases cache size to PAGE_SIZE in advance and it is waste because debug pagealloc cannot work for it when it isn't the off slab. To improve this situation, this patch checks first that this cache with increased size is suitable for off slab. It actually increases cache size when it is suitable for off-slab, so possible waste is removed. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 9b56685..21aad9d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2206,10 +2206,17 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) */ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; - size = PAGE_SIZE; + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } #endif -- cgit v0.10.2 From 3217fd9bdf0017bd0847939f67d52a9c71d8fc56 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:41 -0700 Subject: mm/slab: make criteria for off slab determination robust and simple To become an off slab, there are some constraints to avoid bootstrapping problem and recursive call. This can be avoided differently by simply checking that corresponding kmalloc cache is ready and it's not a off slab. It would be more robust because static size checking can be affected by cache size change or architecture type but dynamic checking isn't. One check 'freelist_cache->size > cachep->size / 2' is added to check benefit of choosing off slab, because, now, there is no size constraint which ensures enough advantage when selecting off slab. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 21aad9d..ab43d9f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -272,7 +272,6 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -1879,7 +1878,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1896,16 +1894,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - offslab_limit = size; - offslab_limit /= sizeof(freelist_idx_t); + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2031,17 +2037,9 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, cachep->num = 0; /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. */ - if (size < OFF_SLAB_MIN_SIZE) - return false; - - if (slab_early_init) - return false; - if (flags & SLAB_NOLEAKTRACE) return false; @@ -2205,7 +2203,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * sized slab is initialized in current slab initialization sequence. */ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && - !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size()) { if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { size_t tmp_size = ALIGN(size, PAGE_SIZE); @@ -2254,14 +2251,6 @@ done: if (OFF_SLAB(cachep)) { cachep->freelist_cache = kmalloc_slab(cachep->freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. - */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } err = setup_cpu_cache(cachep, gfp); -- cgit v0.10.2 From d8410234db6a1bb05408ba2f0e3fd6b04ef626a3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:44 -0700 Subject: mm/slab: factor out slab list fixup code Slab list should be fixed up after object is detached from the slab and this happens at two places. They do exactly same thing. They will be changed in the following patch, so, to reduce code duplication, this patch factor out them and make it common function. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index ab43d9f..95e5d63 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2723,6 +2723,17 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); + else + list_add(&page->lru, &n->slabs_partial); +} + static struct page *get_first_slab(struct kmem_cache_node *n) { struct page *page; @@ -2796,12 +2807,7 @@ retry: ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page); } must_grow: @@ -3067,13 +3073,8 @@ retry: obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page); spin_unlock(&n->list_lock); goto done; -- cgit v0.10.2 From 10b2e9e8e808bd30e1f4018a36366d07b0abd12f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:47 -0700 Subject: mm/slab: factor out debugging initialization in cache_init_objs() cache_init_objs() will be changed in following patch and current form doesn't fit well for that change. So, before doing it, this patch separates debugging initialization. This would cause two loop iteration when debugging is enabled, but, this overhead seems too light than debug feature itself so effect may not be visible. This patch will greatly simplify changes in cache_init_objs() in following patch. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 95e5d63..d3608d1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2460,14 +2460,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2496,10 +2496,22 @@ static void cache_init_objs(struct kmem_cache *cachep, poison_obj(cachep, objp, POISON_FREE); slab_kernel_map(cachep, objp, 0, 0); } -#else - if (cachep->ctor) - cachep->ctor(objp); + } #endif +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + + cache_init_objs_debug(cachep, page); + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) + cachep->ctor(index_to_obj(cachep, page, i)); + set_free_obj(page, i, i); } } -- cgit v0.10.2 From b03a017bebc403d40aa53a092e79b3020786537d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:50 -0700 Subject: mm/slab: introduce new slab management type, OBJFREELIST_SLAB SLAB needs an array to manage freed objects in a slab. It is only used if some objects are freed so we can use free object itself as this array. This requires additional branch in somewhat critical lock path to check if it is first freed object or not but that's all we need. Benefits is that we can save extra memory usage and reduce some computational overhead by allocating a management array when new slab is created. Code change is rather complex than what we can expect from the idea, in order to handle debugging feature efficiently. If you want to see core idea only, please remove '#if DEBUG' block in the patch. Although this idea can apply to all caches whose size is larger than management array size, it isn't applied to caches which have a constructor. If such cache's object is used for management array, constructor should be called for it before that object is returned to user. I guess that overhead overwhelm benefit in that case so this idea doesn't applied to them at least now. For summary, from now on, slab management type is determined by following logic. 1) if management array size is smaller than object size and no ctor, it becomes OBJFREELIST_SLAB. 2) if management array size is smaller than leftover, it becomes NORMAL_SLAB which uses leftover as a array. 3) if OFF_SLAB help to save memory than way 4), it becomes OFF_SLAB. It allocate a management array from the other cache so memory waste happens. 4) others become NORMAL_SLAB. It uses dedicated internal memory in a slab as a management array so it causes memory waste. In my system, without enabling CONFIG_DEBUG_SLAB, Almost caches become OBJFREELIST_SLAB and NORMAL_SLAB (using leftover) which doesn't waste memory. Following is the result of number of caches with specific slab management type. TOTAL = OBJFREELIST + NORMAL(leftover) + NORMAL + OFF /Before/ 126 = 0 + 60 + 25 + 41 /After/ 126 = 97 + 12 + 15 + 2 Result shows that number of caches that doesn't waste memory increase from 60 to 109. I did some benchmarking and it looks that benefit are more than loss. Kmalloc: Repeatedly allocate then free test /Before/ [ 0.286809] 1. Kmalloc: Repeatedly allocate then free test [ 1.143674] 100000 times kmalloc(32) -> 116 cycles kfree -> 78 cycles [ 1.441726] 100000 times kmalloc(64) -> 121 cycles kfree -> 80 cycles [ 1.815734] 100000 times kmalloc(128) -> 168 cycles kfree -> 85 cycles [ 2.380709] 100000 times kmalloc(256) -> 287 cycles kfree -> 95 cycles [ 3.101153] 100000 times kmalloc(512) -> 370 cycles kfree -> 117 cycles [ 3.942432] 100000 times kmalloc(1024) -> 413 cycles kfree -> 156 cycles [ 5.227396] 100000 times kmalloc(2048) -> 622 cycles kfree -> 248 cycles [ 7.519793] 100000 times kmalloc(4096) -> 1102 cycles kfree -> 452 cycles /After/ [ 1.205313] 100000 times kmalloc(32) -> 117 cycles kfree -> 78 cycles [ 1.510526] 100000 times kmalloc(64) -> 124 cycles kfree -> 81 cycles [ 1.827382] 100000 times kmalloc(128) -> 130 cycles kfree -> 84 cycles [ 2.226073] 100000 times kmalloc(256) -> 177 cycles kfree -> 92 cycles [ 2.814747] 100000 times kmalloc(512) -> 286 cycles kfree -> 112 cycles [ 3.532952] 100000 times kmalloc(1024) -> 344 cycles kfree -> 141 cycles [ 4.608777] 100000 times kmalloc(2048) -> 519 cycles kfree -> 210 cycles [ 6.350105] 100000 times kmalloc(4096) -> 789 cycles kfree -> 391 cycles In fact, I tested another idea implementing OBJFREELIST_SLAB with extendable linked array through another freed object. It can remove memory waste completely but it causes more computational overhead in critical lock path and it seems that overhead outweigh benefit. So, this patch doesn't include it. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index d3608d1..85e394f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -270,7 +270,9 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 @@ -480,7 +482,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ - if (flags & CFLGS_OFF_SLAB) { + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { *num = slab_size / buffer_size; *left_over = slab_size % buffer_size; } else { @@ -1801,6 +1803,12 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); @@ -2029,6 +2037,29 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + static bool set_off_slab_cache(struct kmem_cache *cachep, size_t size, unsigned long flags) { @@ -2217,6 +2248,11 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } + if (set_off_slab_cache(cachep, size, flags)) { flags |= CFLGS_OFF_SLAB; goto done; @@ -2434,7 +2470,9 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, page->s_mem = addr + colour_off; page->active = 0; - if (OFF_SLAB(cachep)) { + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); @@ -2507,6 +2545,11 @@ static void cache_init_objs(struct kmem_cache *cachep, cache_init_objs_debug(cachep, page); + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + for (i = 0; i < cachep->num; i++) { /* constructor could break poison info */ if (DEBUG == 0 && cachep->ctor) @@ -2558,6 +2601,9 @@ static void slab_put_obj(struct kmem_cache *cachep, } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2632,7 +2678,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); @@ -2735,14 +2781,42 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct page *page) + struct kmem_cache_node *n, struct page *page, + void **list) { /* move slabp to correct slabp list: */ list_del(&page->lru); - if (page->active == cachep->num) + if (page->active == cachep->num) { list_add(&page->lru, &n->slabs_full); - else + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else list_add(&page->lru, &n->slabs_partial); } @@ -2768,6 +2842,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); @@ -2819,13 +2894,14 @@ retry: ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } - fixup_slab_list(cachep, n, page); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; @@ -3062,6 +3138,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3086,9 +3163,10 @@ retry: obj = slab_get_obj(cachep, page); n->free_objects--; - fixup_slab_list(cachep, n, page); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: -- cgit v0.10.2 From 70f75067b15659bb03404e75eded41011c67dc57 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:53 -0700 Subject: mm/slab: avoid returning values by reference Returing values by reference is bad practice. Instead, just use function return value. Signed-off-by: Joonsoo Kim Suggested-by: Christoph Lameter Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 85e394f..4f4e647 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -460,9 +460,10 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) /* * Calculate the number of objects and left-over bytes for a given buffer size. */ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over, unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -483,13 +484,15 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * correct alignment when allocated. */ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { - *num = slab_size / buffer_size; + num = slab_size / buffer_size; *left_over = slab_size % buffer_size; } else { - *num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); *left_over = slab_size % (buffer_size + sizeof(freelist_idx_t)); } + + return num; } #if DEBUG @@ -1893,7 +1896,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; -- cgit v0.10.2 From f68f8dddb5e9101b5bddb4d61ba0d81e4f9e0040 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:56 -0700 Subject: mm/slab: re-implement pfmemalloc support Current implementation of pfmemalloc handling in SLAB has some problems. 1) pfmemalloc_active is set to true when there is just one or more pfmemalloc slabs in the system, but it is cleared when there is no pfmemalloc slab in one arbitrary kmem_cache. So, pfmemalloc_active could be wrongly cleared. 2) Search to partial and free list doesn't happen when non-pfmemalloc object are not found in cpu cache. Instead, allocating new slab happens and it is not optimal. 3) Even after sk_memalloc_socks() is disabled, cpu cache would keep pfmemalloc objects tagged with SLAB_OBJ_PFMEMALLOC. It isn't cleared if sk_memalloc_socks() is disabled so it could cause problem. 4) If cpu cache is filled with pfmemalloc objects, it would cause slow down non-pfmemalloc allocation. To me, current pointer tagging approach looks complex and fragile so this patch re-implement whole thing instead of fixing problems one by one. Design principle for new implementation is that 1) Don't disrupt non-pfmemalloc allocation in fast path even if sk_memalloc_socks() is enabled. It's more likely case than pfmemalloc allocation. 2) Ensure that pfmemalloc slab is used only for pfmemalloc allocation. 3) Don't consider performance of pfmemalloc allocation in memory deficiency state. As a result, all pfmemalloc alloc/free in memory tight state will be handled in slow-path. If there is non-pfmemalloc free object, it will be returned first even for pfmemalloc user in fast-path so that performance of pfmemalloc user isn't affected in normal case and pfmemalloc objects will be kept as long as possible. Signed-off-by: Joonsoo Kim Tested-by: Mel Gorman Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 4f4e647..863c005 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,23 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - /* * Need this for bootstrapping a per node allocator. */ @@ -623,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) -{ - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, mark this one !pfmemalloc. - */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -939,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1540,10 +1414,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1551,8 +1421,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -2823,7 +2695,46 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, list_add(&page->lru, &n->slabs_partial); } -static struct page *get_first_slab(struct kmem_cache_node *n) +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -2835,11 +2746,41 @@ static struct page *get_first_slab(struct kmem_cache_node *n) struct page, lru); } + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + return page; } -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; @@ -2849,8 +2790,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2876,7 +2816,7 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -2894,7 +2834,7 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } fixup_slab_list(cachep, n, page, &list); @@ -2908,7 +2848,15 @@ alloc_done: if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2916,7 +2864,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -2924,7 +2872,7 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -2982,28 +2930,20 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. @@ -3151,7 +3091,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3304,7 +3244,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); @@ -3410,7 +3349,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** -- cgit v0.10.2 From 282acb436176db665b6d61688ca5885a09cbdace Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:54:59 -0700 Subject: slub: drop lock at the end of free_debug_processing This series takes the suggestion of Christoph Lameter and only focuses on optimizing the slow path where the debug processing runs. The two main optimizations in this series are letting the consistency checks be skipped and relaxing the cmpxchg restrictions when we are not doing consistency checks. With hackbench -g 20 -l 1000 averaged over 100 runs: Before slub_debug=P mean 15.607 variance .086 stdev .294 After slub_debug=P mean 10.836 variance .155 stdev .394 This still isn't as fast as what is in grsecurity unfortunately so there's still work to be done. Profiling ___slab_alloc shows that 25-50% of time is spent in deactivate_slab. I haven't looked too closely to see if this is something that can be optimized. My plan for now is to focus on getting all of this merged (if appropriate) before digging in to another task. This patch (of 4): Currently, free_debug_processing has a comment "Keep node_lock to preserve integrity until the object is actually freed". In actuallity, the lock is dropped immediately in __slab_free. Rather than wait until __slab_free and potentially throw off the unlikely marking, just drop the lock in __slab_free. This also lets free_debug_processing take its own copy of the spinlock flags rather than trying to share the ones from __slab_free. Since there is no use for the node afterwards, change the return type of free_debug_processing to return an int like alloc_debug_processing. Credit to Mathias Krause for the original work which inspired this series [akpm@linux-foundation.org: fix build] Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 9620815..a03e0ae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1044,16 +1044,17 @@ bad: } /* Supports checking bulk free of a constructed freelist */ -static noinline struct kmem_cache_node *free_debug_processing( +static noinline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) + unsigned long addr) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; + unsigned long uninitialized_var(flags); - spin_lock_irqsave(&n->list_lock, *flags); + spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (!check_slab(s, page)) @@ -1106,17 +1107,14 @@ out: bulk_cnt, cnt); slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; + spin_unlock_irqrestore(&n->list_lock, flags); + return 1; fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); + spin_unlock_irqrestore(&n->list_lock, flags); slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + return 0; } static int __init setup_slub_debug(char *str) @@ -1207,10 +1205,10 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline struct kmem_cache_node *free_debug_processing( +static inline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) { return NULL; } + unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -2588,8 +2586,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, head, tail, cnt, - addr, &flags))) + !free_debug_processing(s, page, head, tail, cnt, addr)) return; do { -- cgit v0.10.2 From 804aa132d341cc5838b710e62de02d62e6ad0185 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:02 -0700 Subject: slub: fix/clean free_debug_processing return paths Since commit 19c7ff9ecd89 ("slub: Take node lock during object free checks") check_object has been incorrectly returning success as it follows the out label which just returns the node. Thanks to refactoring, the out and fail paths are now basically the same. Combine the two into one and just use a single label. Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index a03e0ae..744d29b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1053,24 +1053,25 @@ static noinline int free_debug_processing( void *object = head; int cnt = 0; unsigned long uninitialized_var(flags); + int ret = 0; spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (!check_slab(s, page)) - goto fail; + goto out; next_object: cnt++; if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; + goto out; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto fail; + goto out; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) @@ -1087,7 +1088,7 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto fail; + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1101,6 +1102,8 @@ next_object: object = get_freepointer(s, object); goto next_object; } + ret = 1; + out: if (cnt != bulk_cnt) slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", @@ -1108,13 +1111,9 @@ out: slab_unlock(page); spin_unlock_irqrestore(&n->list_lock, flags); - return 1; - -fail: - slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, flags); - slab_fix(s, "Object at 0x%p not freed", object); - return 0; + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; } static int __init setup_slub_debug(char *str) -- cgit v0.10.2 From becfda68abca673d61d5cc953e8e099816db99d9 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:06 -0700 Subject: slub: convert SLAB_DEBUG_FREE to SLAB_CONSISTENCY_CHECKS SLAB_DEBUG_FREE allows expensive consistency checks at free to be turned on or off. Expand its use to be able to turn off all consistency checks. This gives a nice speed up if you only want features such as poisoning or tracing. Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index f0d3409..8465241 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -35,8 +35,8 @@ slub_debug=, Enable options only for select slabs Possible debug options are - F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry - SLAB legacy issues) + F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS + Sorry SLAB legacy issues) Z Red zoning P Poisoning (object and padding) U User tracking (free and alloc) diff --git a/include/linux/slab.h b/include/linux/slab.h index 5d49f0c..e4b5687 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -20,7 +20,7 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ +#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ #define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ diff --git a/mm/slab.h b/mm/slab.h index e880bbe..b793436 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -125,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif @@ -311,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * to not do even the assignment. In that case, slab_equal_or_root * will also be a constant. */ - if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); diff --git a/mm/slub.c b/mm/slub.c index 744d29b..9cde663 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -160,7 +160,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -1007,20 +1007,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) - goto bad; + return 0; if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + return 0; } if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, page, object, addr)) + goto bad; + } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) @@ -1043,39 +1055,21 @@ bad: return 0; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct page *page, - void *head, void *tail, int bulk_cnt, - unsigned long addr) +static inline int free_consistency_checks(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - void *object = head; - int cnt = 0; - unsigned long uninitialized_var(flags); - int ret = 0; - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page); - - if (!check_slab(s, page)) - goto out; - -next_object: - cnt++; - if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto out; + return 0; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto out; + return 0; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; + return 0; if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { @@ -1088,7 +1082,37 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto out; + return 0; + } + return 1; +} + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; + unsigned long uninitialized_var(flags); + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, page, object, addr)) + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1145,7 +1169,7 @@ static int __init setup_slub_debug(char *str) for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': - slub_debug |= SLAB_DEBUG_FREE; + slub_debug |= SLAB_CONSISTENCY_CHECKS; break; case 'z': slub_debug |= SLAB_RED_ZONE; @@ -1449,7 +1473,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug(s)) { + if (s->flags & SLAB_CONSISTENCY_CHECKS) { void *p; slab_pad_check(s, page); @@ -4769,16 +4793,16 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_CONSISTENCY_CHECKS; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_CONSISTENCY_CHECKS; } return length; } @@ -5313,7 +5337,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'd'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 86e698d..1889163 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -135,7 +135,7 @@ static void usage(void) "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" - "f / F Sanity Checks (SLAB_DEBUG_FREE)\n" + "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" "z / Z Redzoning\n" "p / P Poisoning\n" "u / U Tracking\n" -- cgit v0.10.2 From 149daaf3a02c020ddae64668c3a1bd0c4f314d6a Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:09 -0700 Subject: slub: relax CMPXCHG consistency restrictions When debug options are enabled, cmpxchg on the page is disabled. This is because the page must be locked to ensure there are no false positives when performing consistency checks. Some debug options such as poisoning and red zoning only act on the object itself. There is no need to protect other CPUs from modification on only the object. Allow cmpxchg to happen with poisoning and red zoning are set on a slab. Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 9cde663..c2a227d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -164,6 +164,14 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) SLAB_POISON | SLAB_STORE_USER) /* + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information + */ +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + + +/* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata. @@ -3338,7 +3346,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; #endif @@ -4846,7 +4854,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; } calculate_sizes(s, -1); @@ -4867,7 +4874,6 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; } calculate_sizes(s, -1); -- cgit v0.10.2 From d86bd1bece6fc41d59253002db5441fe960a37f6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of object (aka right redzone) so it cannot catch left oob. Although current object's right redzone acts as left redzone of next object, first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left oob more precisely. Background: Someone complained to me that left OOB doesn't catch even if KASAN is enabled which does page allocation debugging. That page is out of our control so it would be allocated when left OOB happens and, in this case, we can't find OOB. Moreover, SLUB debugging feature can be enabled without page allocator debugging and, in this case, we will miss that OOB. Before trying to implement, I expected that changes would be too complex, but, it doesn't look that complex to me now. Almost changes are applied to debug specific functions so I feel okay. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index b7e57927..ac5143f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index c2a227d..2d4d817 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -232,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -279,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -442,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -475,6 +483,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -614,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -631,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -663,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -755,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -804,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -1445,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3274,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3283,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. */ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* -- cgit v0.10.2 From 02c43638ec46fffd0c54b0d10819e36e0bc622f7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 15 Mar 2016 14:55:15 -0700 Subject: fs/mpage.c:mpage_readpages(): use lru_to_page() helper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a..6bd9fd9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, map_bh.b_state = 0; map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = lru_to_page(pages); prefetchw(&page->flags); list_del(&page->lru); -- cgit v0.10.2 From d91749c1dda71a7030c054a5ab8dc5419bc6730b Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Tue, 15 Mar 2016 14:55:18 -0700 Subject: mm/page_alloc.c: calculate zone_start_pfn at zone_spanned_pages_in_node() Xeon E7 v3 based systems supports Address Range Mirroring and UEFI BIOS complied with UEFI spec 2.5 can notify which ranges are mirrored (reliable) via EFI memory map. Now Linux kernel utilize its information and allocates boot time memory from reliable region. My requirement is: - allocate kernel memory from mirrored region - allocate user memory from non-mirrored region In order to meet my requirement, ZONE_MOVABLE is useful. By arranging non-mirrored range into ZONE_MOVABLE, mirrored memory is used for kernel allocations. My idea is to extend existing "kernelcore" option and introduces kernelcore=mirror option. By specifying "mirror" instead of specifying the amount of memory, non-mirrored region will be arranged into ZONE_MOVABLE. Earlier discussions are at: https://lkml.org/lkml/2015/10/9/24 https://lkml.org/lkml/2015/10/15/9 https://lkml.org/lkml/2015/11/27/18 https://lkml.org/lkml/2015/12/8/836 For example, suppose 2-nodes system with the following memory range: node 0 [mem 0x0000000000001000-0x000000109fffffff] node 1 [mem 0x00000010a0000000-0x000000209fffffff] and the following ranges are marked as reliable (mirrored): [0x0000000000000000-0x0000000100000000] [0x0000000100000000-0x0000000180000000] [0x0000000800000000-0x0000000880000000] [0x00000010a0000000-0x0000001120000000] [0x00000017a0000000-0x0000001820000000] If you specify kernelcore=mirror, ZONE_NORMAL and ZONE_MOVABLE are arranged like bellow: - node 0: ZONE_NORMAL : [0x0000000100000000-0x00000010a0000000] ZONE_MOVABLE: [0x0000000180000000-0x00000010a0000000] - node 1: ZONE_NORMAL : [0x00000010a0000000-0x00000020a0000000] ZONE_MOVABLE: [0x0000001120000000-0x00000020a0000000] In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL are treated as absent pages, and vice versa. This patch (of 2): Currently each zone's zone_start_pfn is calculated at free_area_init_core(). However zone's range is fixed at the time when invoking zone_spanned_pages_in_node(). This patch changes how each zone->zone_start_pfn is calculated in zone_spanned_pages_in_node(). Signed-off-by: Taku Izumi Cc: Tony Luck Cc: Xishi Qiu Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Dave Hansen Cc: Matt Fleming Cc: Arnd Bergmann Cc: Steve Capper Cc: Sudeep Holla Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb..0d20a19 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4953,31 +4953,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *ignored) { - unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ - if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ - zone_end_pfn = min(zone_end_pfn, node_end_pfn); - zone_start_pfn = max(zone_start_pfn, node_start_pfn); + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); /* Return the spanned pages */ - return zone_end_pfn - zone_start_pfn; + return *zone_end_pfn - *zone_start_pfn; } /* @@ -5042,8 +5042,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *zones_size) { + unsigned int zone; + + *zone_start_pfn = node_start_pfn; + for (zone = 0; zone < zone_type; zone++) + *zone_start_pfn += zones_size[zone]; + + *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; + return zones_size[zone_type]; } @@ -5072,15 +5082,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; unsigned long size, real_size; size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, zones_size); real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; @@ -5201,7 +5218,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); @@ -5222,6 +5238,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; @@ -5290,7 +5307,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); - zone_start_pfn += size; } } @@ -5358,6 +5374,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); +#else + start_pfn = node_start_pfn; #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); -- cgit v0.10.2 From 342332e6a925e9ed015e5465062c38d2b86ec8f9 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Tue, 15 Mar 2016 14:55:22 -0700 Subject: mm/page_alloc.c: introduce kernelcore=mirror option This patch extends existing "kernelcore" option and introduces kernelcore=mirror option. By specifying "mirror" instead of specifying the amount of memory, non-mirrored (non-reliable) region will be arranged into ZONE_MOVABLE. [akpm@linux-foundation.org: fix build with CONFIG_HAVE_MEMBLOCK_NODE_MAP=n] Signed-off-by: Taku Izumi Tested-by: Sudeep Holla Cc: Tony Luck Cc: Xishi Qiu Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Dave Hansen Cc: Matt Fleming Cc: Arnd Bergmann Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8ae47a7..208ae72 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1750,7 +1750,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. keepinitrd [HW,ARM] - kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter + kernelcore= [KNL,X86,IA-64,PPC] + Format: nn[KMGTPE] | "mirror" + This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested amount is spread evenly throughout all nodes in the system. The @@ -1766,6 +1768,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. use the HighMem zone if it exists, and the Normal zone if it does not. + Instead of specifying the amount of memory (nn[KMGTPE]), + you can specify "mirror" option. In case "mirror" + option is specified, mirrored (reliable) memory is used + for non-movable allocations and remaining memory is used + for Movable pages. nn[KMGTPE] and "mirror" are exclusive, + so you can NOT specify nn[KMGTPE] and "mirror" at the same + time. + kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. Format: [,poll interval] The controller # is the number of the ehci usb debug diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0d20a19..b8160b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -247,6 +247,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static bool mirrored_kernelcore; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4491,6 +4492,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + struct memblock_region *r = NULL, *tmp; +#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; @@ -4516,6 +4520,40 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) break; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * if not mirrored_kernelcore and ZONE_MOVABLE exists, + * range from zone_movable_pfn[nid] to end of each node + * should be ZONE_MOVABLE not ZONE_NORMAL. skip it. + */ + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && + pfn >= zone_movable_pfn[nid]) + continue; + + /* + * check given memblock attribute by firmware which + * can affect kernel memory layout. + * if zone==ZONE_MOVABLE but memory is mirrored, + * it's an overlapped memmap init. skip it. + */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || + pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); + continue; + } + } +#endif } /* @@ -4934,11 +4972,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (*zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; - /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5023,6 +5056,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) @@ -5034,7 +5068,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. + */ + if (zone_movable_pfn[nid]) { + if (mirrored_kernelcore) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } else { + if (zone_type == ZONE_NORMAL) + nr_absent += node_end_pfn - zone_movable_pfn[nid]; + } + } + + return nr_absent; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -5547,6 +5613,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_memblock(memory, r) { + if (memblock_is_mirror(r)) + continue; + + nid = r->nid; + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < 0x100000) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory."); + + goto out2; + } + + /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore @@ -5806,6 +5902,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) */ static int __init cmdline_parse_kernelcore(char *p) { + /* parse kernelcore=mirror */ + if (parse_option_str(p, "mirror")) { + mirrored_kernelcore = true; + return 0; + } + return cmdline_parse_core(p, &required_kernelcore); } -- cgit v0.10.2 From b72d0ffb5dbc4070089b36230b98687ca4577cbc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 15 Mar 2016 14:55:25 -0700 Subject: mm/page_alloc.c: rework code layout in memmap_init_zone() This function is getting full of weird tricks to avoid word-wrapping. Use a goto to eliminate a tab stop then use the new space Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8160b9..fe4378f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4508,54 +4508,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* - * There can be holes in boot-time mem_map[]s - * handed to this function. They do not - * exist on hotplugged memory. + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, - &nr_initialised)) - break; + if (context != MEMMAP_EARLY) + goto not_early; + + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) + break; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * if not mirrored_kernelcore and ZONE_MOVABLE exists, - * range from zone_movable_pfn[nid] to end of each node - * should be ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && - pfn >= zone_movable_pfn[nid]) - continue; + /* + * If not mirrored_kernelcore and ZONE_MOVABLE exists, range + * from zone_movable_pfn[nid] to end of each node should be + * ZONE_MOVABLE not ZONE_NORMAL. skip it. + */ + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) + continue; - /* - * check given memblock attribute by firmware which - * can affect kernel memory layout. - * if zone==ZONE_MOVABLE but memory is mirrored, - * it's an overlapped memmap init. skip it. - */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || - pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); - continue; - } + /* + * Check given memblock attribute by firmware which can affect + * kernel memory layout. If zone==ZONE_MOVABLE but memory is + * mirrored, it's an overlapped memmap init. skip it. + */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); + continue; } -#endif } +#endif +not_early: /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations -- cgit v0.10.2 From d59b1087a98e402ed9a7cc577f4da435f9a555f5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 15 Mar 2016 14:55:27 -0700 Subject: mm/page-writeback: fix dirty_ratelimit calculation Calculation of dirty_ratelimit sometimes is not correct. E.g. initial values of dirty_ratelimit == INIT_BW and step == 0, lead to the following result: UBSAN: Undefined behaviour in ../mm/page-writeback.c:1286:7 shift exponent 25600 is too large for 64-bit type 'long unsigned int' The fix is straightforward - make step 0 if the shift exponent is too big. Signed-off-by: Andrey Ryabinin Cc: Wu Fengguang Cc: Tejun Heo Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6fe7d15..d782cba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; -- cgit v0.10.2 From ea6eabb05b26bd3d6f60b29b77a03bc61479fc0f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:55:30 -0700 Subject: mm/debug_pagealloc: ask users for default setting of debug_pagealloc Since commit 031bc5743f158 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") CONFIG_DEBUG_PAGEALLOC is by default not adding any page debugging. This resulted in several unnoticed bugs, e.g. https://lkml.kernel.org/g/<569F5E29.3090107@de.ibm.com> or https://lkml.kernel.org/g/<56A20F30.4050705@de.ibm.com> as this behaviour change was not even documented in Kconfig. Let's provide a new Kconfig symbol that allows to change the default back to enabled, e.g. for debug kernels. This also makes the change obvious to kernel packagers. Let's also change the Kconfig description for CONFIG_DEBUG_PAGEALLOC, to indicate that there are two stages of overhead. Signed-off-by: Christian Borntraeger Cc: Joonsoo Kim Cc: Peter Zijlstra Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da..a0c136a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,19 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + config PAGE_POISONING bool diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe4378f..36a0a79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -478,7 +478,8 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -489,6 +490,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); -- cgit v0.10.2 From 07061aab2f750bbf61337b922aa8a245b5da85e1 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Tue, 15 Mar 2016 14:55:33 -0700 Subject: mm: fix two typos in comments for to_vmem_altmap() Commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()"), introduced the to_vmem_altmap() function. The comments in this function contain two typos (one misspelling of the Kconfig option CONFIG_SPARSEMEM_VMEMMAP, and one missing letter 'n'), let's fix them up. Signed-off-by: Andreas Ziegler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/memremap.c b/kernel/memremap.c index fb9b887..584febd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -391,7 +391,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /* * 'memmap_start' is the virtual address for the first "struct * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple * pointer arithmetic, so we can perform this to_vmem_altmap() * conversion without concern for the initialization state of * the struct page fields. @@ -400,7 +400,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct dev_pagemap *pgmap; /* - * Uncoditionally retrieve a dev_pagemap associated with the + * Unconditionally retrieve a dev_pagemap associated with the * given physical address, this is only for use in the * arch_{add|remove}_memory() for setting up and tearing down * the memmap. -- cgit v0.10.2 From 32b635298ff4e991d8d8f64dc23782b02eec29c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 15 Mar 2016 14:55:36 -0700 Subject: mm: filemap: remove redundant code in do_read_cache_page do_read_cache_page and __read_cache_page duplicate page filler code when filling the page for the first time. This patch simply removes the duplicate logic. Signed-off-by: Mel Gorman Reviewed-by: Jan Kara Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index da7a35d..deae0b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2325,31 +2325,19 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) - -{ - struct page *page; - int err; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } if (PageUptodate(page)) goto out; @@ -2357,21 +2345,14 @@ retry: if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; -- cgit v0.10.2 From ebded02788b5d7c7600f8cff26ae07896d568649 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 15 Mar 2016 14:55:39 -0700 Subject: mm: filemap: avoid unnecessary calls to lock_page when waiting for IO to complete during a read In the generic read paths the kernel looks up a page in the page cache and if it's up to date, it is used. If not, the page lock is acquired to wait for IO to complete and then check the page. If multiple processes are waiting on IO, they all serialise against the lock and duplicate the checks. This is unnecessary. The page lock in itself does not give any guarantees to the callers about the page state as it can be immediately truncated or reclaimed after the page is unlocked. It's sufficient to wait_on_page_locked and then continue if the page is up to date on wakeup. It is possible that a truncated but up-to-date page is returned but the reference taken during read prevents it disappearing underneath the caller and the data is still valid if PageUptodate. The overall impact is small as even if processes serialise on the lock, the lock section is tiny once the IO is complete. Profiles indicated that unlock_page and friends are generally a tiny portion of a read-intensive workload. An artificial test was created that had instances of dd access a cache-cold file on an ext4 filesystem and measure how long the read took. paralleldd 4.4.0 4.4.0 vanilla avoidlock Amean Elapsd-1 5.28 ( 0.00%) 5.15 ( 2.50%) Amean Elapsd-4 5.29 ( 0.00%) 5.17 ( 2.12%) Amean Elapsd-7 5.28 ( 0.00%) 5.18 ( 1.78%) Amean Elapsd-12 5.20 ( 0.00%) 5.33 ( -2.50%) Amean Elapsd-21 5.14 ( 0.00%) 5.21 ( -1.41%) Amean Elapsd-30 5.30 ( 0.00%) 5.12 ( 3.38%) Amean Elapsd-48 5.78 ( 0.00%) 5.42 ( 6.21%) Amean Elapsd-79 6.78 ( 0.00%) 6.62 ( 2.46%) Amean Elapsd-110 9.09 ( 0.00%) 8.99 ( 1.15%) Amean Elapsd-128 10.60 ( 0.00%) 10.43 ( 1.66%) The impact is small but intuitively, it makes sense to avoid unnecessary calls to lock_page. Signed-off-by: Mel Gorman Reviewed-by: Jan Kara Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index deae0b9..4a0f5fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1668,6 +1668,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -2341,12 +2350,52 @@ filler: if (PageUptodate(page)) goto out; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) + goto out; + + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; -- cgit v0.10.2 From 20f6e03a40ba536dfc7c6f83dd894d994aeb39f3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:42 -0700 Subject: tracepoints: move trace_print_flags definitions to tracepoint-defs.h The following patch will need to declare array of struct trace_print_flags in a header. To prevent this header from pulling in all of RCU through trace_events.h, move the struct trace_print_flags{_64} definitions to the new lightweight tracepoint-defs.h header. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 925730b..705df7d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -15,16 +15,6 @@ struct tracer; struct dentry; struct bpf_prog; -struct trace_print_flags { - unsigned long mask; - const char *name; -}; - -struct trace_print_flags_u64 { - unsigned long long mask; - const char *name; -}; - const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index e1ee97c..4ac89ac 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -3,13 +3,23 @@ /* * File can be included directly by headers who only want to access - * tracepoint->key to guard out of line trace calls. Otherwise - * linux/tracepoint.h should be used. + * tracepoint->key to guard out of line trace calls, or the definition of + * trace_print_flags{_u64}. Otherwise linux/tracepoint.h should be used. */ #include #include +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +struct trace_print_flags_u64 { + unsigned long long mask; + const char *name; +}; + struct tracepoint_func { void *func; void *data; -- cgit v0.10.2 From 1f7866b4aebd19e2525775083279e171b36783a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:45 -0700 Subject: mm, tracing: make show_gfp_flags() up to date The show_gfp_flags() macro provides human-friendly printing of gfp flags in tracepoints. However, it is somewhat out of date and missing several flags. This patches fills in the missing flags, and distinguishes properly between GFP_ATOMIC and __GFP_ATOMIC which were both translated to "GFP_ATOMIC". More generally, all __GFP_X flags which were previously printed as GFP_X, are now printed as __GFP_X, since ommiting the underscores results in output that doesn't actually match the source code, and can only lead to confusion. Where both variants are defined equal (e.g. _DMA and _DMA32), the variant without underscores are preferred. Also add a note in gfp.h so hopefully future changes will be synced better. __GFP_MOVABLE is defined twice in include/linux/gfp.h with different comments. Leave just the newer one, which was intended to replace the old one. Signed-off-by: Vlastimil Babka Reviewed-by: Michal Hocko Acked-by: David Rientjes Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index af1f2b2..bbe5e7f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -9,6 +9,11 @@ struct vm_area_struct; +/* + * In case of changes, please don't forget to update + * include/trace/events/gfpflags.h + */ + /* Plain integer GFP bitmasks. Do not use this directly. */ #define ___GFP_DMA 0x01u #define ___GFP_HIGHMEM 0x02u @@ -48,7 +53,6 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)___GFP_DMA) #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) -#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */ #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index dde6bf0..f53b216 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -11,33 +11,42 @@ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ - {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \ + {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ {(unsigned long)GFP_USER, "GFP_USER"}, \ {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ + {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ - {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \ - {(unsigned long)__GFP_IO, "GFP_IO"}, \ - {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ - {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ - {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \ - {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \ - {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \ - {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ - {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ - {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ - {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \ - {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ - {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ - {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ - {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \ - {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ - ) : "GFP_NOWAIT" + {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ + {(unsigned long)GFP_DMA, "GFP_DMA"}, \ + {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ + {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ + {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ + {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ + {(unsigned long)__GFP_IO, "__GFP_IO"}, \ + {(unsigned long)__GFP_FS, "__GFP_FS"}, \ + {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ + {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ + {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \ + {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ + {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ + {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ + {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ + {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ + {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ + {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ + {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ + {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ + {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ + {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ + {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ + {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ + {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ + {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ + ) : "none" -- cgit v0.10.2 From 14e0a214d62d284ff40b1fd7d687cb66fca9fc67 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:49 -0700 Subject: tools, perf: make gfp_compact_table up to date When updating tracing's show_gfp_flags() I have noticed that perf's gfp_compact_table is also outdated. Fill in the missing flags and place a note in gfp.h to increase chance that future updates are synced. Convert the __GFP_X flags from "GFP_X" to "__GFP_X" strings in line with the previous patch. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bbe5e7f..3d6d878 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -11,7 +11,7 @@ struct vm_area_struct; /* * In case of changes, please don't forget to update - * include/trace/events/gfpflags.h + * include/trace/events/gfpflags.h and tools/perf/builtin-kmem.c */ /* Plain integer GFP bitmasks. Do not use this directly. */ diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 4d3340c..83343ed 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -612,30 +612,39 @@ static const struct { { "GFP_HIGHUSER", "HU" }, { "GFP_USER", "U" }, { "GFP_TEMPORARY", "TMP" }, + { "GFP_KERNEL_ACCOUNT", "KAC" }, { "GFP_KERNEL", "K" }, { "GFP_NOFS", "NF" }, { "GFP_ATOMIC", "A" }, { "GFP_NOIO", "NI" }, - { "GFP_HIGH", "H" }, - { "GFP_WAIT", "W" }, - { "GFP_IO", "I" }, - { "GFP_COLD", "CO" }, - { "GFP_NOWARN", "NWR" }, - { "GFP_REPEAT", "R" }, - { "GFP_NOFAIL", "NF" }, - { "GFP_NORETRY", "NR" }, - { "GFP_COMP", "C" }, - { "GFP_ZERO", "Z" }, - { "GFP_NOMEMALLOC", "NMA" }, - { "GFP_MEMALLOC", "MA" }, - { "GFP_HARDWALL", "HW" }, - { "GFP_THISNODE", "TN" }, - { "GFP_RECLAIMABLE", "RC" }, - { "GFP_MOVABLE", "M" }, - { "GFP_NOTRACK", "NT" }, - { "GFP_NO_KSWAPD", "NK" }, - { "GFP_OTHER_NODE", "ON" }, { "GFP_NOWAIT", "NW" }, + { "GFP_DMA", "D" }, + { "__GFP_HIGHMEM", "HM" }, + { "GFP_DMA32", "D32" }, + { "__GFP_HIGH", "H" }, + { "__GFP_ATOMIC", "_A" }, + { "__GFP_IO", "I" }, + { "__GFP_FS", "F" }, + { "__GFP_COLD", "CO" }, + { "__GFP_NOWARN", "NWR" }, + { "__GFP_REPEAT", "R" }, + { "__GFP_NOFAIL", "NF" }, + { "__GFP_NORETRY", "NR" }, + { "__GFP_COMP", "C" }, + { "__GFP_ZERO", "Z" }, + { "__GFP_NOMEMALLOC", "NMA" }, + { "__GFP_MEMALLOC", "MA" }, + { "__GFP_HARDWALL", "HW" }, + { "__GFP_THISNODE", "TN" }, + { "__GFP_RECLAIMABLE", "RC" }, + { "__GFP_MOVABLE", "M" }, + { "__GFP_ACCOUNT", "AC" }, + { "__GFP_NOTRACK", "NT" }, + { "__GFP_WRITE", "WR" }, + { "__GFP_RECLAIM", "R" }, + { "__GFP_DIRECT_RECLAIM", "DR" }, + { "__GFP_KSWAPD_RECLAIM", "KR" }, + { "__GFP_OTHER_NODE", "ON" }, }; static size_t max_gfp_len; -- cgit v0.10.2 From 420adbe9fc1a45187cfa74df9dbfd72272c4e2fa Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:52 -0700 Subject: mm, tracing: unify mm flags handling in tracepoints and printk In tracepoints, it's possible to print gfp flags in a human-friendly format through a macro show_gfp_flags(), which defines a translation array and passes is to __print_flags(). Since the following patch will introduce support for gfp flags printing in printk(), it would be nice to reuse the array. This is not straightforward, since __print_flags() can't simply reference an array defined in a .c file such as mm/debug.c - it has to be a macro to allow the macro magic to communicate the format to userspace tools such as trace-cmd. The solution is to create a macro __def_gfpflag_names which is used both in show_gfp_flags(), and to define the gfpflag_names[] array in mm/debug.c. On the other hand, mm/debug.c also defines translation tables for page flags and vma flags, and desire was expressed (but not implemented in this series) to use these also from tracepoints. Thus, this patch also renames the events/gfpflags.h file to events/mmflags.h and moves the table definitions there, using the same macro approach as for gfpflags. This allows translating all three kinds of mm-specific flags both in tracepoints and printk. Signed-off-by: Vlastimil Babka Reviewed-by: Michal Hocko Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3d6d878..06546b3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -11,7 +11,7 @@ struct vm_area_struct; /* * In case of changes, please don't forget to update - * include/trace/events/gfpflags.h and tools/perf/builtin-kmem.c + * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ /* Plain integer GFP bitmasks. Do not use this directly. */ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index d866f21..677807f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -6,7 +6,7 @@ #include #include -#include +#include struct btrfs_root; struct btrfs_fs_info; diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index c92d1e1..111e566 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #define COMPACTION_STATUS \ EM( COMPACT_DEFERRED, "deferred") \ diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h deleted file mode 100644 index f53b216..0000000 --- a/include/trace/events/gfpflags.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * The order of these masks is important. Matching masks will be seen - * first and the left over flags will end up showing by themselves. - * - * For example, if we have GFP_KERNEL before GFP_USER we wil get: - * - * GFP_KERNEL|GFP_HARDWALL - * - * Thus most bits set go first. - */ -#define show_gfp_flags(flags) \ - (flags) ? __print_flags(flags, "|", \ - {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ - {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ - {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ - {(unsigned long)GFP_USER, "GFP_USER"}, \ - {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ - {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ - {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ - {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ - {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ - {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ - {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ - {(unsigned long)GFP_DMA, "GFP_DMA"}, \ - {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ - {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ - {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ - {(unsigned long)__GFP_IO, "__GFP_IO"}, \ - {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ - {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ - {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \ - {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ - {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ - {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ - {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ - {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ - {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ - {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ - {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ - {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ - {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ - {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ - {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ - {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ - ) : "none" - diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 47c6212..551ba4a 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -6,8 +6,6 @@ #include -#include - #define SCAN_STATUS \ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index f7554fd..ca72173 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -6,7 +6,7 @@ #include #include -#include +#include DECLARE_EVENT_CLASS(kmem_alloc, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h new file mode 100644 index 0000000..a849185 --- /dev/null +++ b/include/trace/events/mmflags.h @@ -0,0 +1,164 @@ +/* + * The order of these masks is important. Matching masks will be seen + * first and the left over flags will end up showing by themselves. + * + * For example, if we have GFP_KERNEL before GFP_USER we wil get: + * + * GFP_KERNEL|GFP_HARDWALL + * + * Thus most bits set go first. + */ + +#define __def_gfpflag_names \ + {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ + {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ + {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ + {(unsigned long)GFP_USER, "GFP_USER"}, \ + {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ + {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ + {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ + {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ + {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ + {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ + {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \ + {(unsigned long)GFP_DMA, "GFP_DMA"}, \ + {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ + {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ + {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ + {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ + {(unsigned long)__GFP_IO, "__GFP_IO"}, \ + {(unsigned long)__GFP_FS, "__GFP_FS"}, \ + {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ + {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ + {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \ + {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ + {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ + {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ + {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \ + {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \ + {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \ + {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \ + {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \ + {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ + {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ + {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ + {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ + {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ + {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ + {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ + +#define show_gfp_flags(flags) \ + (flags) ? __print_flags(flags, "|", \ + __def_gfpflag_names \ + ) : "none" + +#ifdef CONFIG_MMU +#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_MLOCK(flag,string) +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED +#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_UNCACHED(flag,string) +#endif + +#ifdef CONFIG_MEMORY_FAILURE +#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_HWPOISON(flag,string) +#endif + +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_IDLE(flag,string) +#endif + +#define __def_pageflag_names \ + {1UL << PG_locked, "locked" }, \ + {1UL << PG_error, "error" }, \ + {1UL << PG_referenced, "referenced" }, \ + {1UL << PG_uptodate, "uptodate" }, \ + {1UL << PG_dirty, "dirty" }, \ + {1UL << PG_lru, "lru" }, \ + {1UL << PG_active, "active" }, \ + {1UL << PG_slab, "slab" }, \ + {1UL << PG_owner_priv_1, "owner_priv_1" }, \ + {1UL << PG_arch_1, "arch_1" }, \ + {1UL << PG_reserved, "reserved" }, \ + {1UL << PG_private, "private" }, \ + {1UL << PG_private_2, "private_2" }, \ + {1UL << PG_writeback, "writeback" }, \ + {1UL << PG_head, "head" }, \ + {1UL << PG_swapcache, "swapcache" }, \ + {1UL << PG_mappedtodisk, "mappedtodisk" }, \ + {1UL << PG_reclaim, "reclaim" }, \ + {1UL << PG_swapbacked, "swapbacked" }, \ + {1UL << PG_unevictable, "unevictable" } \ +IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ +IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ +IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ +IF_HAVE_PG_IDLE(PG_young, "young" ) \ +IF_HAVE_PG_IDLE(PG_idle, "idle" ) + +#define show_page_flags(flags) \ + (flags) ? __print_flags(flags, "|", \ + __def_pageflag_names \ + ) : "none" + +#if defined(CONFIG_X86) +#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" } +#elif defined(CONFIG_PPC) +#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" } +#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) +#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" } +#elif !defined(CONFIG_MMU) +#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" } +#else +#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" } +#endif + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, +#else +#define IF_HAVE_VM_SOFTDIRTY(flag,name) +#endif + +#define __def_vmaflag_names \ + {VM_READ, "read" }, \ + {VM_WRITE, "write" }, \ + {VM_EXEC, "exec" }, \ + {VM_SHARED, "shared" }, \ + {VM_MAYREAD, "mayread" }, \ + {VM_MAYWRITE, "maywrite" }, \ + {VM_MAYEXEC, "mayexec" }, \ + {VM_MAYSHARE, "mayshare" }, \ + {VM_GROWSDOWN, "growsdown" }, \ + {VM_PFNMAP, "pfnmap" }, \ + {VM_DENYWRITE, "denywrite" }, \ + {VM_LOCKONFAULT, "lockonfault" }, \ + {VM_LOCKED, "locked" }, \ + {VM_IO, "io" }, \ + {VM_SEQ_READ, "seqread" }, \ + {VM_RAND_READ, "randread" }, \ + {VM_DONTCOPY, "dontcopy" }, \ + {VM_DONTEXPAND, "dontexpand" }, \ + {VM_ACCOUNT, "account" }, \ + {VM_NORESERVE, "noreserve" }, \ + {VM_HUGETLB, "hugetlb" }, \ + __VM_ARCH_SPECIFIC , \ + {VM_DONTDUMP, "dontdump" }, \ +IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ + {VM_MIXEDMAP, "mixedmap" }, \ + {VM_HUGEPAGE, "hugepage" }, \ + {VM_NOHUGEPAGE, "nohugepage" }, \ + {VM_MERGEABLE, "mergeable" } \ + +#define show_vma_flags(flags) \ + (flags) ? __print_flags(flags, "|", \ + __def_vmaflag_names \ + ) : "none" diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 31763dd..0101ef3 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u diff --git a/mm/debug.c b/mm/debug.c index f05b2d5..410af90 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,41 +9,14 @@ #include #include #include +#include static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, - {1UL << PG_head, "head" }, - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) - {1UL << PG_young, "young" }, - {1UL << PG_idle, "idle" }, -#endif + __def_pageflag_names +}; + +static const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names }; static void dump_flags(unsigned long flags, @@ -108,47 +81,8 @@ EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKONFAULT, "lockonfault" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, +static const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names }; void dump_vma(const struct vm_area_struct *vma) @@ -162,7 +96,7 @@ void dump_vma(const struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); + dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names)); } EXPORT_SYMBOL(dump_vma); @@ -233,8 +167,8 @@ void dump_mm(const struct mm_struct *mm) "" /* This is here to not have a comma! */ ); - dump_flags(mm->def_flags, vmaflags_names, - ARRAY_SIZE(vmaflags_names)); + dump_flags(mm->def_flags, vmaflag_names, + ARRAY_SIZE(vmaflag_names)); } #endif /* CONFIG_DEBUG_VM */ diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 83343ed..c9cb3be 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -602,7 +602,7 @@ static int gfpcmp(const void *a, const void *b) return fa->flags - fb->flags; } -/* see include/trace/events/gfpflags.h */ +/* see include/trace/events/mmflags.h */ static const struct { const char *original; const char *compact; -- cgit v0.10.2 From edf14cdbf9a0e5ab52698ca66d07a76ade0d5c46 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:56 -0700 Subject: mm, printk: introduce new format string for flags In mm we use several kinds of flags bitfields that are sometimes printed for debugging purposes, or exported to userspace via sysfs. To make them easier to interpret independently on kernel version and config, we want to dump also the symbolic flag names. So far this has been done with repeated calls to pr_cont(), which is unreliable on SMP, and not usable for e.g. sysfs export. To get a more reliable and universal solution, this patch extends printk() format string for pointers to handle the page flags (%pGp), gfp_flags (%pGg) and vma flags (%pGv). Existing users of dump_flag_names() are converted and simplified. It would be possible to pass flags by value instead of pointer, but the %p format string for pointers already has extensions for various kernel structures, so it's a good fit, and the extra indirection in a non-critical path is negligible. [linux@rasmusvillemoes.dk: lots of good implementation suggestions] Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 5d1128b..5962949 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -298,6 +298,24 @@ bitmap and its derivatives such as cpumask and nodemask: Passed by reference. +Flags bitfields such as page flags, gfp_flags: + + %pGp referenced|uptodate|lru|active|private + %pGg GFP_USER|GFP_DMA32|GFP_NOWARN + %pGv read|exec|mayread|maywrite|mayexec|denywrite + + For printing flags bitfields as a collection of symbolic constants that + would construct the value. The type of flags is given by the third + character. Currently supported are [p]age flags, [v]ma_flags (both + expect unsigned long *) and [g]fp_flags (expects gfp_t *). The flag + names and print order depends on the particular type. + + Note that this format should not be used directly in TP_printk() part + of a tracepoint. Instead, use the show_*_flags() functions from + . + + Passed by reference. + Network device features: %pNF 0x000000000000c000 diff --git a/lib/test_printf.c b/lib/test_printf.c index 4f6ae60..563f10e 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -17,6 +17,9 @@ #include #include +#include +#include + #define BUF_SIZE 256 #define PAD_SIZE 16 #define FILL_CHAR '$' @@ -411,6 +414,55 @@ netdev_features(void) } static void __init +flags(void) +{ + unsigned long flags; + gfp_t gfp; + char *cmp_buffer; + + flags = 0; + test("", "%pGp", &flags); + + /* Page flags should filter the zone id */ + flags = 1UL << NR_PAGEFLAGS; + test("", "%pGp", &flags); + + flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru + | 1UL << PG_active | 1UL << PG_swapbacked; + test("uptodate|dirty|lru|active|swapbacked", "%pGp", &flags); + + + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC + | VM_DENYWRITE; + test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags); + + gfp = GFP_TRANSHUGE; + test("GFP_TRANSHUGE", "%pGg", &gfp); + + gfp = GFP_ATOMIC|__GFP_DMA; + test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); + + gfp = __GFP_ATOMIC; + test("__GFP_ATOMIC", "%pGg", &gfp); + + cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); + if (!cmp_buffer) + return; + + /* Any flags not translated by the table should remain numeric */ + gfp = ~__GFP_BITS_MASK; + snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); + test(cmp_buffer, "%pGg", &gfp); + + snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + (unsigned long) gfp); + gfp |= __GFP_ATOMIC; + test(cmp_buffer, "%pGg", &gfp); + + kfree(cmp_buffer); +} + +static void __init test_pointer(void) { plain(); @@ -428,6 +480,7 @@ test_pointer(void) struct_clk(); bitmap(); netdev_features(); + flags(); } static int __init diff --git a/lib/vsprintf.c b/lib/vsprintf.c index f44e178..525c8e1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -35,6 +35,8 @@ #include #endif +#include "../mm/internal.h" /* For the trace_print_flags arrays */ + #include /* for PAGE_SIZE */ #include /* for dereference_function_descriptor() */ #include /* cpu_to_le16 */ @@ -1407,6 +1409,72 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, } } +static +char *format_flags(char *buf, char *end, unsigned long flags, + const struct trace_print_flags *names) +{ + unsigned long mask; + const struct printf_spec strspec = { + .field_width = -1, + .precision = -1, + }; + const struct printf_spec numspec = { + .flags = SPECIAL|SMALL, + .field_width = -1, + .precision = -1, + .base = 16, + }; + + for ( ; flags && names->name; names++) { + mask = names->mask; + if ((flags & mask) != mask) + continue; + + buf = string(buf, end, names->name, strspec); + + flags &= ~mask; + if (flags) { + if (buf < end) + *buf = '|'; + buf++; + } + } + + if (flags) + buf = number(buf, end, flags, numspec); + + return buf; +} + +static noinline_for_stack +char *flags_string(char *buf, char *end, void *flags_ptr, const char *fmt) +{ + unsigned long flags; + const struct trace_print_flags *names; + + switch (fmt[1]) { + case 'p': + flags = *(unsigned long *)flags_ptr; + /* Remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + names = pageflag_names; + break; + case 'v': + flags = *(unsigned long *)flags_ptr; + names = vmaflag_names; + break; + case 'g': + flags = *(gfp_t *)flags_ptr; + names = gfpflag_names; + break; + default: + WARN_ONCE(1, "Unsupported flags modifier: %c\n", fmt[1]); + return buf; + } + + return format_flags(buf, end, flags, names); +} + int kptr_restrict __read_mostly; /* @@ -1495,6 +1563,11 @@ int kptr_restrict __read_mostly; * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'Cr' For a clock, it prints the current rate of the clock + * - 'G' For flags to be printed as a collection of symbolic strings that would + * construct the specific value. Supported flags given by option: + * p page flags (see struct page) given as pointer to unsigned long + * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t + * v vma flags (VM_*) given as pointer to unsigned long * * ** Please update also Documentation/printk-formats.txt when making changes ** * @@ -1648,6 +1721,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return bdev_name(buf, end, ptr, spec, fmt); #endif + case 'G': + return flags_string(buf, end, ptr, fmt); } spec.flags |= SMALL; if (spec.field_width == -1) { diff --git a/mm/debug.c b/mm/debug.c index 410af90..0328fd3 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -11,12 +11,21 @@ #include #include -static const struct trace_print_flags pageflag_names[] = { - __def_pageflag_names +#include "internal.h" + +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; + +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} }; -static const struct trace_print_flags gfpflag_names[] = { - __def_gfpflag_names +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} }; static void dump_flags(unsigned long flags, @@ -58,14 +67,15 @@ void dump_page_badflags(struct page *page, const char *reason, if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + dump_flags(page->flags, pageflag_names, + ARRAY_SIZE(pageflag_names) - 1); if (reason) pr_alert("page dumped because: %s\n", reason); if (page->flags & badflags) { pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); + dump_flags(page->flags & badflags, pageflag_names, + ARRAY_SIZE(pageflag_names) - 1); } #ifdef CONFIG_MEMCG if (page->mem_cgroup) @@ -81,10 +91,6 @@ EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflag_names[] = { - __def_vmaflag_names -}; - void dump_vma(const struct vm_area_struct *vma) { pr_emerg("vma %p start %p end %p\n" @@ -96,7 +102,7 @@ void dump_vma(const struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names)); + dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names) - 1); } EXPORT_SYMBOL(dump_vma); @@ -168,7 +174,7 @@ void dump_mm(const struct mm_struct *mm) ); dump_flags(mm->def_flags, vmaflag_names, - ARRAY_SIZE(vmaflag_names)); + ARRAY_SIZE(vmaflag_names) - 1); } #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/internal.h b/mm/internal.h index a38a21e..6636e1d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * The set of flags that only affect watermark checking and reclaim @@ -466,4 +467,9 @@ static inline void try_to_unmap_flush_dirty(void) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + #endif /* __MM_INTERNAL_H */ -- cgit v0.10.2 From b8eceeb99014cf5ae90d7f2c606d36074baa73ae Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:59 -0700 Subject: mm, debug: replace dump_flags() with the new printk formats With the new printk format strings for flags, we can get rid of dump_flags() in mm/debug.c. This also fixes dump_vma() which used dump_flags() for printing vma flags. However dump_flags() did a page-flags specific filtering of bits higher than NR_PAGEFLAGS in order to remove the zone id part. For dump_vma() this resulted in removing several VM_* flags from the symbolic translation. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/debug.c b/mm/debug.c index 0328fd3..231e145 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -28,36 +28,6 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - pr_emerg("flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - pr_cont("%s%s", delim, names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - pr_cont("%s%#lx", delim, flags); - - pr_cont(")\n"); -} - void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags) { @@ -68,15 +38,15 @@ void dump_page_badflags(struct page *page, const char *reason, pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - dump_flags(page->flags, pageflag_names, - ARRAY_SIZE(pageflag_names) - 1); + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + if (reason) pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, pageflag_names, - ARRAY_SIZE(pageflag_names) - 1); - } + + badflags &= page->flags; + if (badflags) + pr_alert("bad because of flags: %#lx(%pGp)\n", badflags, + &badflags); #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -96,13 +66,14 @@ void dump_vma(const struct vm_area_struct *vma) pr_emerg("vma %p start %p end %p\n" "next %p prev %p mm %p\n" "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", + "pgoff %lx file %p private_data %p\n" + "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names) - 1); + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -136,7 +107,7 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" #endif - "%s", /* This is here to hold the comma */ + "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU @@ -170,11 +141,8 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) mm->tlb_flush_pending, #endif - "" /* This is here to not have a comma! */ - ); - - dump_flags(mm->def_flags, vmaflag_names, - ARRAY_SIZE(vmaflag_names) - 1); + mm->def_flags, &mm->def_flags + ); } #endif /* CONFIG_DEBUG_VM */ -- cgit v0.10.2 From c5c990e8a1fe313a5ab13f069f1410acf932927b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:02 -0700 Subject: mm, page_alloc: print symbolic gfp_flags on allocation failure It would be useful to translate gfp_flags into string representation when printing in case of an allocation failure, especially as the flags have been undergoing some changes recently and the script ./scripts/gfp-translate needs a matching source version to be accurate. Example output: stapio: page allocation failure: order:9, mode:0x2080020(GFP_ATOMIC) Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36a0a79..4e8029a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2695,9 +2695,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", - current->comm, order, gfp_mask); - + pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", + current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); -- cgit v0.10.2 From a0795cd416d1142117695f932a9690611ae0edbb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:05 -0700 Subject: mm, oom: print symbolic gfp_flags in oom warning It would be useful to translate gfp_flags into string representation when printing in case of an OOM, especially as the flags have been undergoing some changes recently and the script ./scripts/gfp-translate needs a matching source version to be accurate. Example output: a.out invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|GFP_ZERO), order=0, om_score_adj=0 Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dc490c0..e97a05d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " + "oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) -- cgit v0.10.2 From 60f30350fd69a3e4d5f0f45937d3274c22565134 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:08 -0700 Subject: mm, page_owner: print migratetype of page and pageblock, symbolic flags The information in /sys/kernel/debug/page_owner includes the migratetype of the pageblock the page belongs to. This is also checked against the page's migratetype (as declared by gfp_flags during its allocation), and the page is reported as Fallback if its migratetype differs from the pageblock's one. t This is somewhat misleading because in fact fallback allocation is not the only reason why these two can differ. It also doesn't direcly provide the page's migratetype, although it's possible to derive that from the gfp_flags. It's arguably better to print both page and pageblock's migratetype and leave the interpretation to the consumer than to suggest fallback allocation as the only possible reason. While at it, we can print the migratetypes as string the same way as /proc/pagetypeinfo does, as some of the numeric values depend on kernel configuration. For that, this patch moves the migratetype_names array from #ifdef CONFIG_PROC_FS part of mm/vmstat.c to mm/page_alloc.c and exports it. With the new format strings for flags, we can now also provide symbolic page and gfp flags in the /sys/kernel/debug/page_owner file. This replaces the positional printing of page flags as single letters, which might have looked nicer, but was limited to a subset of flags, and required the user to remember the letters. Example page_owner entry after the patch: Page allocated via order 0, mask 0x24213ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD|__GFP_NOWARN|__GFP_NORETRY) PFN 520 type Movable Block 1 type Movable Flags 0xfffff8001006c(referenced|uptodate|lru|active|mappedtodisk) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_current+0x88/0x120 [] __page_cache_alloc+0xe6/0x120 [] __do_page_cache_readahead+0xdc/0x240 [] ondemand_readahead+0x135/0x260 [] page_cache_sync_readahead+0x31/0x50 [] generic_file_read_iter+0x453/0x760 [] __vfs_read+0xa7/0xd0 Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7b6c2cf..9fc23ab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -63,6 +63,9 @@ enum { MIGRATE_TYPES }; +/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ +extern char * const migratetype_names[MIGRATE_TYPES]; + #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) #else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8029a..030fafc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", + "HighAtomic", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a1..7a37a30 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -100,8 +100,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -110,23 +111,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? "L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? "M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; diff --git a/mm/vmstat.c b/mm/vmstat.c index 084c672..72c1798 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Movable", - "Reclaimable", - "HighAtomic", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { -- cgit v0.10.2 From 7dd80b8af0bcd705a9ef2fa272c082882616a499 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:12 -0700 Subject: mm, page_owner: convert page_owner_inited to static key CONFIG_PAGE_OWNER attempts to impose negligible runtime overhead when enabled during compilation, but not actually enabled during runtime by boot param page_owner=on. This overhead can be further reduced using the static key mechanism, which this patch does. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/page_owner.txt b/Documentation/vm/page_owner.txt index 8f3ce9b..ffff143 100644 --- a/Documentation/vm/page_owner.txt +++ b/Documentation/vm/page_owner.txt @@ -28,10 +28,11 @@ with page owner and page owner is disabled in runtime due to no enabling boot option, runtime overhead is marginal. If disabled in runtime, it doesn't require memory to store owner information, so there is no runtime memory overhead. And, page owner inserts just two unlikely branches into -the page allocator hotpath and if it returns false then allocation is -done like as the kernel without page owner. These two unlikely branches -would not affect to allocation performance. Following is the kernel's -code size change due to this facility. +the page allocator hotpath and if not enabled, then allocation is done +like as the kernel without page owner. These two unlikely branches should +not affect to allocation performance, especially if the static keys jump +label patching functionality is available. Following is the kernel's code +size change due to this facility. - Without page owner text data bss dec hex filename diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index cacaabe..8e2eb15 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -1,8 +1,10 @@ #ifndef __LINUX_PAGE_OWNER_H #define __LINUX_PAGE_OWNER_H +#include + #ifdef CONFIG_PAGE_OWNER -extern bool page_owner_inited; +extern struct static_key_false page_owner_inited; extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); @@ -12,27 +14,23 @@ extern gfp_t __get_page_owner_gfp(struct page *page); static inline void reset_page_owner(struct page *page, unsigned int order) { - if (likely(!page_owner_inited)) - return; - - __reset_page_owner(page, order); + if (static_branch_unlikely(&page_owner_inited)) + __reset_page_owner(page, order); } static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { - if (likely(!page_owner_inited)) - return; - - __set_page_owner(page, order, gfp_mask); + if (static_branch_unlikely(&page_owner_inited)) + __set_page_owner(page, order, gfp_mask); } static inline gfp_t get_page_owner_gfp(struct page *page) { - if (likely(!page_owner_inited)) + if (static_branch_unlikely(&page_owner_inited)) + return __get_page_owner_gfp(page); + else return 0; - - return __get_page_owner_gfp(page); } #else static inline void reset_page_owner(struct page *page, unsigned int order) diff --git a/mm/page_owner.c b/mm/page_owner.c index 7a37a30..feaa28b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,11 @@ #include #include #include +#include #include "internal.h" static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); static void init_early_allocated_pages(void); @@ -37,7 +38,7 @@ static void init_page_owner(void) if (page_owner_disabled) return; - page_owner_inited = true; + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -147,7 +148,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page *page; struct page_ext *page_ext; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -295,7 +296,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 72c1798..69ce64f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1120,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); -- cgit v0.10.2 From d435edca928805074dae005ab9a42d9fa60fc702 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:15 -0700 Subject: mm, page_owner: copy page owner info during migration The page_owner mechanism stores gfp_flags of an allocation and stack trace that lead to it. During page migration, the original information is practically replaced by the allocation of free page as the migration target. Arguably this is less useful and might lead to all the page_owner info for migratable pages gradually converge towards compaction or numa balancing migrations. It has also lead to inaccuracies such as one fixed by commit e2cfc91120fa ("mm/page_owner: set correct gfp_mask on page_owner"). This patch thus introduces copying the page_owner info during migration. However, since the fact that the page has been migrated from its original place might be useful for debugging, the next patch will introduce a way to track that information as well. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 8e2eb15..6440daa 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,6 +11,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); extern gfp_t __get_page_owner_gfp(struct page *page); +extern void __copy_page_owner(struct page *oldpage, struct page *newpage); static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -32,6 +33,11 @@ static inline gfp_t get_page_owner_gfp(struct page *page) else return 0; } +static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +{ + if (static_branch_unlikely(&page_owner_inited)) + __copy_page_owner(oldpage, newpage); +} #else static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -44,6 +50,8 @@ static inline gfp_t get_page_owner_gfp(struct page *page) { return 0; } - +static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +{ +} #endif /* CONFIG_PAGE_OWNER */ #endif /* __LINUX_PAGE_OWNER_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 3ad0fea..8133805 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -578,6 +579,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); } /************************************************************ diff --git a/mm/page_owner.c b/mm/page_owner.c index feaa28b..774b556 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -84,6 +84,31 @@ gfp_t __get_page_owner_gfp(struct page *page) return page_ext->gfp_mask; } +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + int i; + + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->nr_entries = old_ext->nr_entries; + + for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) + new_ext->trace_entries[i] = old_ext->trace_entries[i]; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overal stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) -- cgit v0.10.2 From 7cd12b4abfd2f8f42414c520bbd051a5b7dc7a8c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:18 -0700 Subject: mm, page_owner: track and print last migrate reason During migration, page_owner info is now copied with the rest of the page, so the stacktrace leading to free page allocation during migration is overwritten. For debugging purposes, it might be however useful to know that the page has been migrated since its initial allocation. This might happen many times during the lifetime for different reasons and fully tracking this, especially with stacktraces would incur extra memory costs. As a compromise, store and print the migrate_reason of the last migration that occurred to the page. This is enough to distinguish compaction, numa balancing etc. Example page_owner entry after the patch: Page allocated via order 0, mask 0x24200ca(GFP_HIGHUSER_MOVABLE) PFN 628753 type Movable Block 1228 type Movable Flags 0x1fffff80040030(dirty|lru|swapbacked) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_vma+0xb5/0x250 [] shmem_alloc_page+0x61/0x90 [] shmem_getpage_gfp+0x678/0x960 [] shmem_fallocate+0x329/0x440 [] vfs_fallocate+0x140/0x230 [] SyS_fallocate+0x44/0x70 [] entry_SYSCALL_64_fastpath+0x12/0x71 Page has been migrated, last migrate reason: compaction Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/migrate.h b/include/linux/migrate.h index cac1c09..9b50325 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -23,9 +23,13 @@ enum migrate_reason { MR_SYSCALL, /* also applies to cpusets */ MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, - MR_CMA + MR_CMA, + MR_TYPES }; +/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */ +extern char *migrate_reason_names[MR_TYPES]; + #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 17f118a..e1fe7cf 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -45,6 +45,7 @@ struct page_ext { unsigned int order; gfp_t gfp_mask; unsigned int nr_entries; + int last_migrate_reason; unsigned long trace_entries[8]; #endif }; diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 6440daa..555893b 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -12,6 +12,7 @@ extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); extern gfp_t __get_page_owner_gfp(struct page *page); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); +extern void __set_page_owner_migrate_reason(struct page *page, int reason); static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -38,6 +39,11 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage) if (static_branch_unlikely(&page_owner_inited)) __copy_page_owner(oldpage, newpage); } +static inline void set_page_owner_migrate_reason(struct page *page, int reason) +{ + if (static_branch_unlikely(&page_owner_inited)) + __set_page_owner_migrate_reason(page, reason); +} #else static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -53,5 +59,8 @@ static inline gfp_t get_page_owner_gfp(struct page *page) static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { } +static inline void set_page_owner_migrate_reason(struct page *page, int reason) +{ +} #endif /* CONFIG_PAGE_OWNER */ #endif /* __LINUX_PAGE_OWNER_H */ diff --git a/mm/debug.c b/mm/debug.c index 231e145..78dc548 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -10,9 +10,20 @@ #include #include #include +#include #include "internal.h" +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", +}; + const struct trace_print_flags pageflag_names[] = { __def_pageflag_names, {0, NULL} diff --git a/mm/migrate.c b/mm/migrate.c index 8133805..432ecd0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -955,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { put_new_page = NULL; + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -1021,7 +1023,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1079,6 +1081,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1151,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1842,6 +1845,7 @@ fail_putback: set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); page_remove_rmap(page, true); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/page_owner.c b/mm/page_owner.c index 774b556..a57068c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" static bool page_owner_disabled = true; @@ -73,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) page_ext->order = order; page_ext->gfp_mask = gfp_mask; page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + page_ext->last_migrate_reason = reason; +} + gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); @@ -151,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; -- cgit v0.10.2 From 4e462112e98f9ad6dd62e160f8b14c7df5fed2fc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:21 -0700 Subject: mm, page_owner: dump page owner info from dump_page() The page_owner mechanism is useful for dealing with memory leaks. By reading /sys/kernel/debug/page_owner one can determine the stack traces leading to allocations of all pages, and find e.g. a buggy driver. This information might be also potentially useful for debugging, such as the VM_BUG_ON_PAGE() calls to dump_page(). So let's print the stored info from dump_page(). Example output: page:ffffea000292f1c0 count:1 mapcount:0 mapping:ffff8800b2f6cc18 index:0x91d flags: 0x1fffff8001002c(referenced|uptodate|lru|mappedtodisk) page dumped because: VM_BUG_ON_PAGE(1) page->mem_cgroup:ffff8801392c5000 page allocated via order 0, migratetype Movable, gfp_mask 0x24213ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD|__GFP_NOWARN|__GFP_NORETRY) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_current+0x88/0x120 [] __page_cache_alloc+0xe6/0x120 [] __do_page_cache_readahead+0xdc/0x240 [] ondemand_readahead+0x135/0x260 [] page_cache_async_readahead+0x6c/0x70 [] generic_file_read_iter+0x3f2/0x760 [] __vfs_read+0xa7/0xd0 page has been migrated, last migrate reason: compaction Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 555893b..46f1b93 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -13,6 +13,7 @@ extern void __set_page_owner(struct page *page, extern gfp_t __get_page_owner_gfp(struct page *page); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); +extern void __dump_page_owner(struct page *page); static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -44,6 +45,11 @@ static inline void set_page_owner_migrate_reason(struct page *page, int reason) if (static_branch_unlikely(&page_owner_inited)) __set_page_owner_migrate_reason(page, reason); } +static inline void dump_page_owner(struct page *page) +{ + if (static_branch_unlikely(&page_owner_inited)) + __dump_page_owner(page); +} #else static inline void reset_page_owner(struct page *page, unsigned int order) { @@ -62,5 +68,8 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage) static inline void set_page_owner_migrate_reason(struct page *page, int reason) { } +static inline void dump_page_owner(struct page *page) +{ +} #endif /* CONFIG_PAGE_OWNER */ #endif /* __LINUX_PAGE_OWNER_H */ diff --git a/mm/debug.c b/mm/debug.c index 78dc548..61b1f1b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -67,6 +68,7 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 030fafc..d98672d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -443,6 +443,7 @@ static void bad_page(struct page *page, const char *reason, printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); dump_page_badflags(page, reason, bad_flags); + dump_page_owner(page); print_modules(); dump_stack(); diff --git a/mm/page_owner.c b/mm/page_owner.c index a57068c..44ad1f0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -183,6 +183,31 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + gfp_t gfp_mask = page_ext->gfp_mask; + int mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + pr_alert("page allocated via order %u, migratetype %s, " + "gfp_mask %#x(%pGg)\n", page_ext->order, + migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { -- cgit v0.10.2 From ff8e81163889ac4c7f59e7f7db6377d0c5d8d69c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:24 -0700 Subject: mm, debug: move bad flags printing to bad_page() Since bad_page() is the only user of the badflags parameter of dump_page_badflags(), we can move the code to bad_page() and simplify a bit. The dump_page_badflags() function is renamed to __dump_page() and can still be called separately from dump_page() for temporary debug prints where page_owner info is not desired. The only user-visible change is that page->mem_cgroup is printed before the bad flags. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 053824b0..de7be78 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,8 +9,7 @@ struct vm_area_struct; struct mm_struct; extern void dump_page(struct page *page, const char *reason); -extern void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags); +extern void __dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); diff --git a/mm/debug.c b/mm/debug.c index 61b1f1b..df7247b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -40,8 +40,7 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) +void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), @@ -50,15 +49,12 @@ void dump_page_badflags(struct page *page, const char *reason, pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); if (reason) pr_alert("page dumped because: %s\n", reason); - badflags &= page->flags; - if (badflags) - pr_alert("bad because of flags: %#lx(%pGp)\n", badflags, - &badflags); #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -67,7 +63,7 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { - dump_page_badflags(page, reason, 0); + __dump_page(page, reason); dump_page_owner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d98672d..0691403 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,7 +430,7 @@ static void bad_page(struct page *page, const char *reason, goto out; } if (nr_unshown) { - printk(KERN_ALERT + pr_alert( "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; @@ -440,9 +440,13 @@ static void bad_page(struct page *page, const char *reason, if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page_badflags(page, reason, bad_flags); + __dump_page(page, reason); + bad_flags &= page->flags; + if (bad_flags) + pr_alert("bad because of flags: %#lx(%pGp)\n", + bad_flags, &bad_flags); dump_page_owner(page); print_modules(); -- cgit v0.10.2 From 8823b1dbc05fab1a8bec275eeae4709257c2661d Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:27 -0700 Subject: mm/page_poison.c: enable PAGE_POISONING as a separate option Page poisoning is currently set up as a feature if architectures don't have architecture debug page_alloc to allow unmapping of pages. It has uses apart from that though. Clearing of the pages on free provides an increase in security as it helps to limit the risk of information leaks. Allow page poisoning to be enabled as a separate option independent of kernel_map pages since the two features do separate work. Because of how hiberanation is implemented, the checks on alloc cannot occur if hibernation is enabled. The runtime alloc checks can also be enabled with an option when !HIBERNATION. Credit to Grsecurity/PaX team for inspiring this work Signed-off-by: Laura Abbott Cc: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 208ae72..8e5abd6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2731,6 +2731,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. we can turn it on. on: enable the feature + page_poison= [KNL] Boot-time parameter changing the state of + poisoning on the buddy allocator. + off: turn off poisoning + on: turn on poisoning + panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting timeout = 0: wait forever diff --git a/include/linux/mm.h b/include/linux/mm.h index 69fd6bb..99dcc8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2176,6 +2176,15 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +#ifdef CONFIG_PAGE_POISONING +extern bool page_poisoning_enabled(void); +extern void kernel_poison_pages(struct page *page, int numpages, int enable); +#else +static inline bool page_poisoning_enabled(void) { return false; } +static inline void kernel_poison_pages(struct page *page, int numpages, + int enable) { } +#endif + #ifdef CONFIG_DEBUG_PAGEALLOC extern bool _debug_pagealloc_enabled; extern void __kernel_map_pages(struct page *page, int numpages, int enable); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index a0c136a..1f99f9a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -41,4 +41,27 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT can be overridden by debug_pagealloc=off|on. config PAGE_POISONING - bool + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N. diff --git a/mm/Makefile b/mm/Makefile index 2ed4319..cfdd481d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c deleted file mode 100644 index 5bf5906..0000000 --- a/mm/debug-pagealloc.c +++ /dev/null @@ -1,137 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -static bool page_poisoning_enabled __read_mostly; - -static bool need_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return false; - - return true; -} - -static void init_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return; - - page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline bool page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static void poison_page(struct page *page) -{ - void *addr = kmap_atomic(page); - - set_page_poison(page); - memset(addr, PAGE_POISON, PAGE_SIZE); - kunmap_atomic(addr); -} - -static void poison_pages(struct page *page, int n) -{ - int i; - - for (i = 0; i < n; i++) - poison_page(page + i); -} - -static bool single_bit_flip(unsigned char a, unsigned char b) -{ - unsigned char error = a ^ b; - - return error && !(error & (error - 1)); -} - -static void check_poison_mem(unsigned char *mem, size_t bytes) -{ - static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); - unsigned char *start; - unsigned char *end; - - start = memchr_inv(mem, PAGE_POISON, bytes); - if (!start) - return; - - for (end = mem + bytes - 1; end > start; end--) { - if (*end != PAGE_POISON) - break; - } - - if (!__ratelimit(&ratelimit)) - return; - else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); - else - printk(KERN_ERR "pagealloc: memory corruption\n"); - - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, - end - start + 1, 1); - dump_stack(); -} - -static void unpoison_page(struct page *page) -{ - void *addr; - - if (!page_poison(page)) - return; - - addr = kmap_atomic(page); - check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); - kunmap_atomic(addr); -} - -static void unpoison_pages(struct page *page, int n) -{ - int i; - - for (i = 0; i < n; i++) - unpoison_page(page + i); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0691403..2a08349 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1025,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); return true; @@ -1420,6 +1421,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) diff --git a/mm/page_poison.c b/mm/page_poison.c new file mode 100644 index 0000000..89d3bc7 --- /dev/null +++ b/mm/page_poison.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include + +static bool __page_poisoning_enabled __read_mostly; +static bool want_page_poisoning __read_mostly; + +static int early_page_poison_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; + + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + return __page_poisoning_enabled; +} + +static bool need_page_poisoning(void) +{ + return want_page_poisoning; +} + +static void init_page_poisoning(void) +{ + /* + * page poisoning is debug page alloc for some arches. If either + * of those options are enabled, enable poisoning + */ + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { + if (!want_page_poisoning && !debug_pagealloc_enabled()) + return; + } else { + if (!want_page_poisoning) + return; + } + + __page_poisoning_enabled = true; +} + +struct page_ext_operations page_poisoning_ops = { + .need = need_page_poisoning, + .init = init_page_poisoning, +}; + +static inline void set_page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static inline void clear_page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static inline bool page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static void poison_page(struct page *page) +{ + void *addr = kmap_atomic(page); + + set_page_poison(page); + memset(addr, PAGE_POISON, PAGE_SIZE); + kunmap_atomic(addr); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); + unsigned char *start; + unsigned char *end; + + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + + start = memchr_inv(mem, PAGE_POISON, bytes); + if (!start) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!__ratelimit(&ratelimit)) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + pr_err("pagealloc: single bit error\n"); + else + pr_err("pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_page(struct page *page) +{ + void *addr; + + if (!page_poison(page)) + return; + + addr = kmap_atomic(page); + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + kunmap_atomic(addr); +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_poison_pages(struct page *page, int numpages, int enable) +{ + if (!page_poisoning_enabled()) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif -- cgit v0.10.2 From 1414c7f4f7d72d138fff35f00151d15749b5beda Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:30 -0700 Subject: mm/page_poisoning.c: allow for zero poisoning By default, page poisoning uses a poison value (0xaa) on free. If this is changed to 0, the page is not only sanitized but zeroing on alloc with __GFP_ZERO can be skipped as well. The tradeoff is that detecting corruption from the poisoning is harder to detect. This feature also cannot be used with hibernation since pages are not guaranteed to be zeroed after hibernation. Credit to Grsecurity/PaX team for inspiring this work Signed-off-by: Laura Abbott Acked-by: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 99dcc8f..b97243d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2179,10 +2179,12 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern bool page_is_poisoned(struct page *page); #else static inline bool page_poisoning_enabled(void) { return false; } static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } +static inline bool page_is_poisoned(struct page *page) { return false; } #endif #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/include/linux/poison.h b/include/linux/poison.h index 4a27153..51334ed 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -30,7 +30,11 @@ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/debug-pagealloc.c **********/ +#ifdef CONFIG_PAGE_POISONING_ZERO +#define PAGE_POISON 0x00 +#else #define PAGE_POISON 0xaa +#endif /********** mm/page_alloc.c ************/ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a2..aa0f26b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1158,6 +1158,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. + */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1182,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1f99f9a..5c50b23 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -65,3 +65,17 @@ config PAGE_POISONING_NO_SANITY If you are only interested in sanitization, say Y. Otherwise say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation. + + Enabling page poisoning with this option will disable hibernation + + If unsure, say N diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a08349..50897dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1405,15 +1405,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1424,7 +1433,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b..2d864e6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_poison.c b/mm/page_poison.c index 89d3bc7..479e7ea 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -71,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -137,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); -- cgit v0.10.2 From 5b3810e5c6e1b9a1858464a53b5d72ee050bb918 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:33 -0700 Subject: mm, sl[au]b: print gfp_flags as strings in slab_out_of_memory() We can now print gfp_flags more human-readable. Make use of this in slab_out_of_memory() for SLUB and SLAB. Also convert the SLAB variant it to pr_warn() along the way. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 863c005..852fc5c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1350,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1377,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } diff --git a/mm/slub.c b/mm/slub.c index 2d4d817..6c91324 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2236,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); -- cgit v0.10.2 From 23a003bfd23ea9ea0b7756b920e51f64b284b468 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 15 Mar 2016 14:56:36 -0700 Subject: mm/madvise: pass return code of memory_failure() to userspace Currently the return value of memory_failure() is not passed to userspace when madvise(MADV_HWPOISON) is used. This is inconvenient for test programs that want to know the result of error handling. So let's return it to the caller as we already do in the MADV_SOFT_OFFLINE case. Signed-off-by: Naoya Horiguchi Cc: Chen Gong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/madvise.c b/mm/madvise.c index f56825b..6a77114 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) } pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); - /* Ignore return value for now */ - memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + if (ret) + return ret; } return 0; } -- cgit v0.10.2 From 0b94f17507dc3a92f091812e5bff890c8845d1a3 Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Tue, 15 Mar 2016 14:56:39 -0700 Subject: mm/memory-failure.c: remove useless "undef"s Remove the useless #undef, since the corresponding #define has already been removed. Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ac595e7..67c30eb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -826,8 +826,6 @@ static struct page_state { #undef lru #undef swapbacked #undef head -#undef tail -#undef compound #undef slab #undef reserved -- cgit v0.10.2 From 4355c018c2ba8017592520573e76ad376ad656db Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Tue, 15 Mar 2016 14:56:42 -0700 Subject: mm/mempolicy.c: skip VM_HUGETLB and VM_MIXEDMAP VMA for lazy mbind VM_HUGETLB and VM_MIXEDMAP vma needs to be excluded to avoid compound pages being marked for migration and unexpected COWs when handling hugetlb fault. Thanks to Naoya Horiguchi for reminding me on these checks. Signed-off-by: Liang Chen Signed-off-by: Gavin Guo Suggested-by: Naoya Horiguchi Acked-by: David Rientjes Cc: SeongJae Park Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9a3f6b9..8cbc743 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (!is_vm_hugetlb_page(vma) && + (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } -- cgit v0.10.2 From 9cb65bc3b1114004e2ccee5939031325c7bf16e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 15 Mar 2016 14:56:45 -0700 Subject: mm/memory.c: make apply_to_page_range() more robust MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arm and arm64 used to trigger this BUG_ON() - this has now been fixed. But a WARN_ON() here is sufficient to catch future buggy callers. Signed-off-by: Mika Penttilä Reviewed-by: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 8132787..8adb5b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1876,7 +1876,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long end = addr + size; int err; - BUG_ON(addr >= end); + if (WARN_ON(addr >= end)) + return -EINVAL; + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); -- cgit v0.10.2 From 31bc3858ea3ebcc3157b3f5f0e624c5962f5a7a6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 15 Mar 2016 14:56:48 -0700 Subject: memory-hotplug: add automatic onlining policy for the newly added memory Currently, all newly added memory blocks remain in 'offline' state unless someone onlines them, some linux distributions carry special udev rules like: SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", ATTR{state}="online" to make this happen automatically. This is not a great solution for virtual machines where memory hotplug is being used to address high memory pressure situations as such onlining is slow and a userspace process doing this (udev) has a chance of being killed by the OOM killer as it will probably require to allocate some memory. Introduce default policy for the newly added memory blocks in /sys/devices/system/memory/auto_online_blocks file with two possible values: "offline" which preserves the current behavior and "online" which causes all newly added memory blocks to go online as soon as they're added. The default is "offline". Signed-off-by: Vitaly Kuznetsov Reviewed-by: Daniel Kiper Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Daniel Kiper Cc: Dan Williams Cc: Tang Chen Cc: David Vrabel Acked-by: David Rientjes Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: Mel Gorman Cc: "K. Y. Srinivasan" Cc: Igor Mammedov Cc: Kay Sievers Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index ce2cfcf..443f4b4 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -256,10 +256,27 @@ If the memory block is offline, you'll read "offline". 5.2. How to online memory ------------ -Even if the memory is hot-added, it is not at ready-to-use state. -For using newly added memory, you have to "online" the memory block. +When the memory is hot-added, the kernel decides whether or not to "online" +it according to the policy which can be read from "auto_online_blocks" file: -For onlining, you have to write "online" to the memory block's state file as: +% cat /sys/devices/system/memory/auto_online_blocks + +The default is "offline" which means the newly added memory is not in a +ready-to-use state and you have to "online" the newly added memory blocks +manually. Automatic onlining can be requested by writing "online" to +"auto_online_blocks" file: + +% echo online > /sys/devices/system/memory/auto_online_blocks + +This sets a global policy and impacts all memory blocks that will subsequently +be hotplugged. Currently offline blocks keep their state. It is possible, under +certain circumstances, that some memory blocks will be added but will fail to +online. User space tools can check their "state" files +(/sys/devices/system/memory/memoryXXX/state) and try to online them manually. + +If the automatic onlining wasn't requested, failed, or some memory block was +offlined it is possible to change the individual block's state by writing to the +"state" file: % echo online > /sys/devices/system/memory/memoryXXX/state diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 213456c..f46dba8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -251,7 +251,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t return ret; } -static int memory_block_change_state(struct memory_block *mem, +int memory_block_change_state(struct memory_block *mem, unsigned long to_state, unsigned long from_state_req) { int ret = 0; @@ -439,6 +439,37 @@ print_block_size(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL); /* + * Memory auto online policy. + */ + +static ssize_t +show_auto_online_blocks(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (memhp_auto_online) + return sprintf(buf, "online\n"); + else + return sprintf(buf, "offline\n"); +} + +static ssize_t +store_auto_online_blocks(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + if (sysfs_streq(buf, "online")) + memhp_auto_online = true; + else if (sysfs_streq(buf, "offline")) + memhp_auto_online = false; + else + return -EINVAL; + + return count; +} + +static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks, + store_auto_online_blocks); + +/* * Some architectures will have custom drivers to do this, and * will not need to do it from userspace. The fake hot-add code * as well as ppc64 will do all of their discovery in userspace @@ -746,6 +777,7 @@ static struct attribute *memory_root_attrs[] = { #endif &dev_attr_block_size_bytes.attr, + &dev_attr_auto_online_blocks.attr, NULL }; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index dc4305b..e6058de 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -338,7 +338,7 @@ static enum bp_state reserve_additional_memory(void) } #endif - rc = add_memory_resource(nid, resource); + rc = add_memory_resource(nid, resource, false); if (rc) { pr_warn("Cannot add additional memory (%i)\n", rc); goto err; diff --git a/include/linux/memory.h b/include/linux/memory.h index 8b8d8d1..82730ad 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -109,6 +109,9 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); extern int register_new_memory(int, struct mem_section *); +extern int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, + unsigned long from_state_req); #ifdef CONFIG_MEMORY_HOTREMOVE extern int unregister_memory_section(struct mem_section *); #endif diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4340599..769d768 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -99,6 +99,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); +extern bool memhp_auto_online; + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size); @@ -267,7 +269,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource); +extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 979b18c..484e867 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -77,6 +77,9 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool memhp_auto_online; +EXPORT_SYMBOL_GPL(memhp_auto_online); + void get_online_mems(void) { might_sleep(); @@ -1261,8 +1264,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, return zone_default; } +static int online_memory_block(struct memory_block *mem, void *arg) +{ + return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; pg_data_t *pgdat = NULL; @@ -1322,6 +1330,11 @@ int __ref add_memory_resource(int nid, struct resource *res) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* online pages if requested */ + if (online) + walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), + NULL, online_memory_block); + goto out; error: @@ -1345,7 +1358,7 @@ int __ref add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; -- cgit v0.10.2 From 703fc13a3f6615e29ce3eb862275d7b58a5d03ba Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 15 Mar 2016 14:56:52 -0700 Subject: xen_balloon: support memory auto onlining policy Add support for the newly added kernel memory auto onlining policy to Xen ballon driver. Signed-off-by: Vitaly Kuznetsov Suggested-by: Daniel Kiper Reviewed-by: Daniel Kiper Acked-by: David Vrabel Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Daniel Kiper Cc: Dan Williams Cc: Tang Chen Cc: David Rientjes Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: Mel Gorman Cc: "K. Y. Srinivasan" Cc: Igor Mammedov Cc: Kay Sievers Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 73708ac..979a831 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -37,23 +37,30 @@ config XEN_BALLOON_MEMORY_HOTPLUG Memory could be hotplugged in following steps: - 1) dom0: xl mem-max + 1) target domain: ensure that memory auto online policy is in + effect by checking /sys/devices/system/memory/auto_online_blocks + file (should be 'online'). + + 2) control domain: xl mem-max where is >= requested memory size, - 2) dom0: xl mem-set + 3) control domain: xl mem-set where is requested memory size; alternatively memory could be added by writing proper value to /sys/devices/system/xen_memory/xen_memory0/target or - /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU, + /sys/devices/system/xen_memory/xen_memory0/target_kb on the + target domain. - 3) domU: for i in /sys/devices/system/memory/memory*/state; do \ - [ "`cat "$i"`" = offline ] && echo online > "$i"; done + Alternatively, if memory auto onlining was not requested at step 1 + the newly added memory can be manually onlined in the target domain + by doing the following: - Memory could be onlined automatically on domU by adding following line to udev rules: + for i in /sys/devices/system/memory/memory*/state; do \ + [ "`cat "$i"`" = offline ] && echo online > "$i"; done - SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" + or by adding the following line to udev rules: - In that case step 3 should be omitted. + SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT int "Hotplugged memory limit (in GiB) for a PV guest" diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index e6058de..7c8a2cf 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -338,7 +338,16 @@ static enum bp_state reserve_additional_memory(void) } #endif - rc = add_memory_resource(nid, resource, false); + /* + * add_memory_resource() will call online_pages() which in its turn + * will call xen_online_page() callback causing deadlock if we don't + * release balloon_mutex here. Unlocking here is safe because the + * callers drop the mutex before trying again. + */ + mutex_unlock(&balloon_mutex); + rc = add_memory_resource(nid, resource, memhp_auto_online); + mutex_lock(&balloon_mutex); + if (rc) { pr_warn("Cannot add additional memory (%i)\n", rc); goto err; -- cgit v0.10.2 From cecf257b62e39eab0e802e91c62ae5679673db02 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 15 Mar 2016 14:56:55 -0700 Subject: mm: vmscan: do not clear SHRINKER_NUMA_AWARE if nr_node_ids == 1 Currently, on shrinker registration we clear SHRINKER_NUMA_AWARE if there's the only NUMA node present. The comment states that this will allow us to save some small loop time later. It used to be true when this code was added (see commit 1d3d4437eae1b ("vmscan: per-node deferred work")), but since commit 6b4f7799c6a57 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") it doesn't make any difference. Anyway, running on non-NUMA machine shouldn't make a shrinker NUMA unaware, so zap this hunk. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 71b1c29..db5722a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); - /* - * If we only have one possible node in the system anyway, save - * ourselves the trouble and disable NUMA aware behavior. This way we - * will save memory and some small loop time later. - */ - if (nr_node_ids == 1) - shrinker->flags &= ~SHRINKER_NUMA_AWARE; - if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; -- cgit v0.10.2 From d7206a70af5c094446927b5dea8704f0f96303e3 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 15 Mar 2016 14:56:58 -0700 Subject: mm/madvise: update comment on sys_madvise() Some new MADV_* advices are not documented in sys_madvise() comment. So let's update it. [akpm@linux-foundation.org: modifications suggested by Michal] Signed-off-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Minchan Kim Cc: "Kirill A. Shutemov" Cc: Jason Baron Cc: Chen Gong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/madvise.c b/mm/madvise.c index 6a77114..a011473 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -639,14 +639,28 @@ madvise_behavior_valid(int behavior) * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * * return values: * zero - success -- cgit v0.10.2 From 0db2cb8da89d991762ec2aece45e55ceaee34664 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 15 Mar 2016 14:57:01 -0700 Subject: mm, vmscan: make zone_reclaimable_pages more precise zone_reclaimable_pages() is used in should_reclaim_retry() which uses it to calculate the target for the watermark check. This means that precise numbers are important for the correct decision. zone_reclaimable_pages uses zone_page_state which can contain stale data with per-cpu diffs not synced yet (the last vmstat_update might have run 1s in the past). Use zone_page_state_snapshot() in zone_reclaimable_pages() instead. None of the current callers is in a hot path where getting the precise value (which involves per-cpu iteration) would cause an unreasonable overhead. Signed-off-by: Michal Hocko Signed-off-by: Tetsuo Handa Suggested-by: David Rientjes Acked-by: David Rientjes Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index db5722a..039f08d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,21 +195,21 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON) + - zone_page_state(zone, NR_ISOLATED_ANON); + nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); return nr; } bool zone_reclaimable(struct zone *zone) { - return zone_page_state(zone, NR_PAGES_SCANNED) < + return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < zone_reclaimable_pages(zone) * 6; } -- cgit v0.10.2 From 81f8c3a461d16f0355ced3d56d6d1bb5923207a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:04 -0700 Subject: mm: memcontrol: generalize locking for the page->mem_cgroup binding These patches tag the page cache radix tree eviction entries with the memcg an evicted page belonged to, thus making per-cgroup LRU reclaim work properly and be as adaptive to new cache workingsets as global reclaim already is. This should have been part of the original thrash detection patch series, but was deferred due to the complexity of those patches. This patch (of 5): So far the only sites that needed to exclude charge migration to stabilize page->mem_cgroup have been per-cgroup page statistics, hence the name mem_cgroup_begin_page_stat(). But per-cgroup thrash detection will add another site that needs to ensure page->mem_cgroup lifetime. Rename these locking functions to the more generic lock_page_memcg() and unlock_page_memcg(). Since charge migration is a cgroup1 feature only, we might be able to delete it at some point, and these now easy to identify locking sites along with it. Signed-off-by: Johannes Weiner Suggested-by: Vladimir Davydov Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index e1632ab..dc99151 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,7 +621,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. * - * The caller must hold mem_cgroup_begin_page_stat() lock. + * The caller must hold lock_page_memcg(). */ static void __set_page_dirty(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg, int warn) @@ -683,17 +683,17 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Use mem_group_begin_page_stat() to keep PageDirty synchronized with - * per-memcg dirty page counters. + * Lock out page->mem_cgroup migration to keep PageDirty + * synchronized with per-memcg dirty page counters. */ - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) __set_page_dirty(page, mapping, memcg, 1); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1169,13 +1169,13 @@ void mark_buffer_dirty(struct buffer_head *bh) struct address_space *mapping = NULL; struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (!TestSetPageDirty(page)) { mapping = page_mapping(page); if (mapping) __set_page_dirty(page, mapping, memcg, 0); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a9ebabfe..5f85ebc 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1978,10 +1978,10 @@ xfs_vm_set_page_dirty( } while (bh != head); } /* - * Use mem_group_begin_page_stat() to keep PageDirty synchronized with - * per-memcg dirty page counters. + * Lock out page->mem_cgroup migration to keep PageDirty + * synchronized with per-memcg dirty page counters. */ - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1998,7 +1998,7 @@ xfs_vm_set_page_dirty( } spin_unlock_irqrestore(&mapping->tree_lock, flags); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 30b02e7..8502fd4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -429,8 +429,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); +struct mem_cgroup *lock_page_memcg(struct page *page); +void unlock_page_memcg(struct mem_cgroup *memcg); /** * mem_cgroup_update_page_stat - update page state statistics @@ -438,7 +438,13 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); * @idx: page state item to account * @val: number of pages (positive or negative) * - * See mem_cgroup_begin_page_stat() for locking requirements. + * Callers must use lock_page_memcg() to prevent double accounting + * when the page is concurrently being moved to another memcg: + * + * memcg = lock_page_memcg(page); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * unlock_page_memcg(memcg); */ static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) @@ -613,12 +619,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) { return NULL; } -static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +static inline void unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/mm/filemap.c b/mm/filemap.c index 4a0f5fa..ee8140c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -101,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -177,7 +177,7 @@ static void page_cache_tree_delete(struct address_space *mapping, * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * lock_page_memcg(). */ void __delete_from_page_cache(struct page *page, void *shadow, struct mem_cgroup *memcg) @@ -263,11 +263,11 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (freepage) freepage(page); @@ -561,7 +561,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); + memcg = lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL, memcg); error = radix_tree_insert(&mapping->page_tree, offset, new); @@ -576,7 +576,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d06cae2..953f0f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1709,19 +1709,13 @@ cleanup: } /** - * mem_cgroup_begin_page_stat - begin a page state statistics transaction - * @page: page that is going to change accounted state - * - * This function must mark the beginning of an accounted page state - * change to prevent double accounting when the page is concurrently - * being moved to another memcg: + * lock_page_memcg - lock a page->mem_cgroup binding + * @page: the page * - * memcg = mem_cgroup_begin_page_stat(page); - * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg); + * This function protects unlocked LRU pages from being moved to + * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1759,20 +1753,20 @@ again: /* * When charge migration first begins, we can have locked and * unlocked page stat updates happening concurrently. Track - * the task who has the lock for mem_cgroup_end_page_stat(). + * the task who has the lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; return memcg; } -EXPORT_SYMBOL(mem_cgroup_begin_page_stat); +EXPORT_SYMBOL(lock_page_memcg); /** - * mem_cgroup_end_page_stat - finish a page state statistics transaction - * @memcg: the memcg that was accounted against + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @memcg: the memcg returned by lock_page_memcg() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +void unlock_page_memcg(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1785,7 +1779,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_end_page_stat); +EXPORT_SYMBOL(unlock_page_memcg); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -4923,9 +4917,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) lru_add_drain_all(); /* - * Signal mem_cgroup_begin_page_stat() to take the memcg's - * move_lock while we're moving its pages to another memcg. - * Then wait for already started RCU-only updates to finish. + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. */ atomic_inc(&mc.from->moving_account); synchronize_rcu(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d782cba..2b5ea12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2410,7 +2410,7 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2442,7 +2442,7 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg, struct bdi_writeback *wb) @@ -2471,13 +2471,13 @@ int __set_page_dirty_nobuffers(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 1; } @@ -2488,7 +2488,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2496,7 +2496,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2629,14 +2629,14 @@ void cancel_dirty_page(struct page *page) struct mem_cgroup *memcg; bool locked; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, memcg, wb); unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } else { ClearPageDirty(page); } @@ -2705,7 +2705,7 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); @@ -2714,7 +2714,7 @@ int clear_page_dirty_for_io(struct page *page) ret = 1; } unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } return TestClearPageDirty(page); @@ -2727,7 +2727,7 @@ int test_clear_page_writeback(struct page *page) struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2755,7 +2755,7 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } @@ -2765,7 +2765,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2796,7 +2796,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 79f3bf0..2871e7d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1289,19 +1289,19 @@ void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1325,7 +1325,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } static void page_remove_anon_compound_rmap(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c index e3ee0e2..51a24f6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,7 +528,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; @@ -536,7 +536,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +545,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 039f08d..08547a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -608,7 +608,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -648,7 +648,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -676,7 +676,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (freepage != NULL) freepage(page); @@ -686,7 +686,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } -- cgit v0.10.2 From 689c94f03ae251181725fe853b4ffc870f05c7fe Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:07 -0700 Subject: mm: workingset: #define radix entry eviction mask This is a compile-time constant, no need to calculate it on refault. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/workingset.c b/mm/workingset.c index 61ead9e..3ef92f6 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -152,6 +152,10 @@ * refault distance will immediately activate the refaulting page. */ +#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + ZONES_SHIFT + NODES_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + static void *pack_shadow(unsigned long eviction, struct zone *zone) { eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); @@ -168,7 +172,6 @@ static void unpack_shadow(void *shadow, unsigned long entry = (unsigned long)shadow; unsigned long eviction; unsigned long refault; - unsigned long mask; int zid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; @@ -181,8 +184,7 @@ static void unpack_shadow(void *shadow, *zone = NODE_DATA(nid)->node_zones + zid; refault = atomic_long_read(&(*zone)->inactive_age); - mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + - RADIX_TREE_EXCEPTIONAL_SHIFT); + /* * The unsigned subtraction here gives an accurate distance * across inactive_age overflows in most cases. @@ -199,7 +201,7 @@ static void unpack_shadow(void *shadow, * inappropriate activation leading to pressure on the active * list is not a problem. */ - *distance = (refault - eviction) & mask; + *distance = (refault - eviction) & EVICTION_MASK; } /** -- cgit v0.10.2 From 162453bfbdf4c0f58cb3058aad9ad8cda1044cda Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:10 -0700 Subject: mm: workingset: separate shadow unpacking and refault calculation Per-cgroup thrash detection will need to derive a live memcg from the eviction cookie, and doing that inside unpack_shadow() will get nasty with the reference handling spread over two functions. In preparation, make unpack_shadow() clearly about extracting static data, and let workingset_refault() do all the higher-level handling. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/workingset.c b/mm/workingset.c index 3ef92f6..f874b2c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -165,13 +165,10 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, - struct zone **zone, - unsigned long *distance) +static void unpack_shadow(void *shadow, struct zone **zonep, + unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - unsigned long eviction; - unsigned long refault; int zid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; @@ -179,29 +176,9 @@ static void unpack_shadow(void *shadow, entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; - eviction = entry; - - *zone = NODE_DATA(nid)->node_zones + zid; - refault = atomic_long_read(&(*zone)->inactive_age); - - /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. - * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. - */ - *distance = (refault - eviction) & EVICTION_MASK; + *zonep = NODE_DATA(nid)->node_zones + zid; + *evictionp = entry; } /** @@ -233,9 +210,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long eviction; + unsigned long refault; struct zone *zone; - unpack_shadow(shadow, &zone, &refault_distance); + unpack_shadow(shadow, &zone, &eviction); + + refault = atomic_long_read(&zone->inactive_age); + + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; + inc_zone_state(zone, WORKINGSET_REFAULT); if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { -- cgit v0.10.2 From 612e44939c3c77245ac80843c0c7876c8cf97282 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:13 -0700 Subject: mm: workingset: eviction buckets for bigmem/lowbit machines For per-cgroup thrash detection, we need to store the memcg ID inside the radix tree cookie as well. However, on 32 bit that doesn't leave enough bits for the eviction timestamp to cover the necessary range of recently evicted pages. The radix tree entry would look like this: [ RADIX_TREE_EXCEPTIONAL(2) | ZONEID(2) | MEMCGID(16) | EVICTION(12) ] 12 bits means 4096 pages, means 16M worth of recently evicted pages. But refaults are actionable up to distances covering half of memory. To not miss refaults, we have to stretch out the range at the cost of how precisely we can tell when a page was evicted. This way we can shave off lower bits from the eviction timestamp until the necessary range is covered. E.g. grouping evictions into 1M buckets (256 pages) will stretch the longest representable refault distance to 4G. This patch implements eviction buckets that are automatically sized according to the available bits and the necessary refault range, in preparation for per-cgroup thrash detection. The maximum actionable distance is currently half of memory, but to support memory hotplug of up to 200% of boot-time memory, we size the buckets to cover double the distance. Beyond that, thrashing won't be detectable anymore. During boot, the kernel will print out the exact parameters, like so: [ 0.113929] workingset: timestamp_bits=12 max_order=18 bucket_order=6 In this example, there are 12 radix entry bits available for the eviction timestamp, to cover a maximum distance of 2^18 pages (this is a 1G machine). Consequently, evictions must be grouped into buckets of 2^6 pages, or 256K. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/workingset.c b/mm/workingset.c index f874b2c..9a26a60 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,19 @@ ZONES_SHIFT + NODES_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the radix tree + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits. + */ +static unsigned int bucket_order __read_mostly; + static void *pack_shadow(unsigned long eviction, struct zone *zone) { + eviction >>= bucket_order; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -178,7 +189,7 @@ static void unpack_shadow(void *shadow, struct zone **zonep, entry >>= NODES_SHIFT; *zonep = NODE_DATA(nid)->node_zones + zid; - *evictionp = entry; + *evictionp = entry << bucket_order; } /** @@ -400,8 +411,25 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits; + unsigned int max_order; int ret; + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; -- cgit v0.10.2 From 23047a96d7cfcfca1a6d026ecaec526ea4803e9e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:16 -0700 Subject: mm: workingset: per-cgroup cache thrash detection Cache thrash detection (see a528910e12ec "mm: thrash detection-based file cache sizing" for details) currently only works on the system level, not inside cgroups. Worse, as the refaults are compared to the global number of active cache, cgroups might wrongfully get all their refaults activated when their pages are hotter than those of others. Move the refault machinery from the zone to the lruvec, and then tag eviction entries with the memcg ID. This makes the thrash detection work correctly inside cgroups. [sergey.senozhatsky@gmail.com: do not return from workingset_activation() with locked rcu and page] Signed-off-by: Johannes Weiner Signed-off-by: Sergey Senozhatsky Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8502fd4..09b4498 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -89,6 +89,10 @@ enum mem_cgroup_events_target { }; #ifdef CONFIG_MEMCG + +#define MEM_CGROUP_ID_SHIFT 16 +#define MEM_CGROUP_ID_MAX USHRT_MAX + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -265,6 +269,11 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +static inline bool mem_cgroup_disabled(void) +{ + return !cgroup_subsys_enabled(memory_cgrp_subsys); +} + /** * mem_cgroup_events - count memory events against a cgroup * @memcg: the memory cgroup @@ -312,6 +321,28 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + return memcg->css.id; +} + +/** + * mem_cgroup_from_id - look up a memcg from an id + * @id: the id to look up + * + * Caller must hold rcu_read_lock() and use css_tryget() as necessary. + */ +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id, &memory_cgrp_subsys); + return mem_cgroup_from_css(css); +} + /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find @@ -353,11 +384,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); ino_t page_cgroup_ino(struct page *page); -static inline bool mem_cgroup_disabled(void) -{ - return !cgroup_subsys_enabled(memory_cgrp_subsys); -} - static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) @@ -502,8 +528,17 @@ void mem_cgroup_split_huge_fixup(struct page *head); #endif #else /* CONFIG_MEMCG */ + +#define MEM_CGROUP_ID_SHIFT 0 +#define MEM_CGROUP_ID_MAX 0 + struct mem_cgroup; +static inline bool mem_cgroup_disabled(void) +{ + return true; +} + static inline void mem_cgroup_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx, unsigned int nr) @@ -586,9 +621,16 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline bool mem_cgroup_disabled(void) +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return true; + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(id); + /* XXX: This should always return root_mem_cgroup */ + return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9fc23ab..03cbdd9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -212,10 +212,12 @@ struct zone_reclaim_stat { }; struct lruvec { - struct list_head lists[NR_LRU_LISTS]; - struct zone_reclaim_stat reclaim_stat; + struct list_head lists[NR_LRU_LISTS]; + struct zone_reclaim_stat reclaim_stat; + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; #ifdef CONFIG_MEMCG - struct zone *zone; + struct zone *zone; #endif }; @@ -490,9 +492,6 @@ struct zone { spinlock_t lru_lock; struct lruvec lruvec; - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; - /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter @@ -761,6 +760,8 @@ static inline struct zone *lruvec_zone(struct lruvec *lruvec) #endif } +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); + #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 953f0f9..864e237 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); -} - #ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. diff --git a/mm/vmscan.c b/mm/vmscan.c index 08547a7..fd434cc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone) zone_reclaimable_pages(zone) * 6; } -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); @@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) unsigned long inactive; unsigned long active; - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); return active > inactive; } @@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. */ if (!inactive_file_is_low(lruvec) && - get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2130,7 +2130,7 @@ out: unsigned long size; unsigned long scan; - size = get_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/workingset.c b/mm/workingset.c index 9a26a60..14bc23a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -153,7 +153,8 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - ZONES_SHIFT + NODES_SHIFT) + ZONES_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -166,9 +167,10 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(unsigned long eviction, struct zone *zone) +static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) { eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -176,18 +178,21 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, struct zone **zonep, +static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - int zid, nid; + int memcgid, nid, zid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; zid = entry & ((1UL << ZONES_SHIFT) - 1); entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; + *memcgidp = memcgid; *zonep = NODE_DATA(nid)->node_zones + zid; *evictionp = entry << bucket_order; } @@ -202,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg = page_memcg(page); struct zone *zone = page_zone(page); + int memcgid = mem_cgroup_id(memcg); unsigned long eviction; + struct lruvec *lruvec; - eviction = atomic_long_inc_return(&zone->inactive_age); - return pack_shadow(eviction, zone); + /* Page is fully exclusive and pins page->mem_cgroup */ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); + return pack_shadow(memcgid, zone, eviction); } /** @@ -221,13 +235,42 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long active_file; + struct mem_cgroup *memcg; unsigned long eviction; + struct lruvec *lruvec; unsigned long refault; struct zone *zone; + int memcgid; - unpack_shadow(shadow, &zone, &eviction); + unpack_shadow(shadow, &memcgid, &zone, &eviction); - refault = atomic_long_read(&zone->inactive_age); + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the page's eviction. + * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared page. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !memcg) { + rcu_read_unlock(); + return false; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -249,7 +292,7 @@ bool workingset_refault(void *shadow) inc_zone_state(zone, WORKINGSET_REFAULT); - if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + if (refault_distance <= active_file) { inc_zone_state(zone, WORKINGSET_ACTIVATE); return true; } @@ -262,7 +305,23 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - atomic_long_inc(&page_zone(page)->inactive_age); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + memcg = lock_page_memcg(page); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. + */ + if (!mem_cgroup_disabled() && !memcg) + goto out; + lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg); + atomic_long_inc(&lruvec->inactive_age); +out: + unlock_page_memcg(memcg); } /* -- cgit v0.10.2 From 6a93ca8fde3cfce0f00f02281139a377c83e8d8c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:19 -0700 Subject: mm: migrate: do not touch page->mem_cgroup of live pages Changing a page's memcg association complicates dealing with the page, so we want to limit this as much as possible. Page migration e.g. does not have to do that. Just like page cache replacement, it can forcibly charge a replacement page, and then uncharge the old page when it gets freed. Temporarily overcharging the cgroup by a single page is not an issue in practice, and charging is so cheap nowadays that this is much preferrable to the headache of messing with live pages. The only place that still changes the page->mem_cgroup binding of live pages is when pages move along with a task to another cgroup. But that path isolates the page from the LRU, takes the page lock, and the move lock (lock_page_memcg()). That means page->mem_cgroup is always stable in callers that have the page isolated from the LRU or locked. Lighter unlocked paths, like writeback accounting, can use lock_page_memcg(). [akpm@linux-foundation.org: fix build] [vdavydov@virtuozzo.com: fix lockdep splat] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 09b4498..c45ab3f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -300,7 +300,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage); +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); @@ -580,7 +580,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_replace_page(struct page *old, struct page *new) +static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index b97243d..6b471d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -905,20 +905,11 @@ static inline struct mem_cgroup *page_memcg(struct page *page) { return page->mem_cgroup; } - -static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) -{ - page->mem_cgroup = memcg; -} #else static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } - -static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) -{ -} #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index ee8140c..d8317ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -577,7 +577,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); unlock_page_memcg(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 864e237..64506b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4457,7 +4457,7 @@ static int mem_cgroup_move_account(struct page *page, VM_BUG_ON(compound && !PageTransHuge(page)); /* - * Prevent mem_cgroup_replace_page() from looking at + * Prevent mem_cgroup_migrate() from looking at * page->mem_cgroup of its source page while we change it. */ ret = -EBUSY; @@ -5486,16 +5486,17 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_replace_page - migrate a charge to another page - * @oldpage: currently charged page - * @newpage: page to transfer the charge to + * mem_cgroup_migrate - charge a page's replacement + * @oldpage: currently circulating page + * @newpage: replacement page * - * Migrate the charge from @oldpage to @newpage. + * Charge @newpage as a replacement page for @oldpage. @oldpage will + * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. * Either or both pages might be on the LRU already. */ -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; unsigned int nr_pages; diff --git a/mm/migrate.c b/mm/migrate.c index 432ecd0..848327d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -326,12 +326,13 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; /* No turning back from here */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) SetPageSwapBacked(newpage); + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -373,7 +374,6 @@ int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page: * no turning back from here. */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -428,6 +428,8 @@ int migrate_page_move_mapping(struct address_space *mapping, } local_irq_enable(); + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -458,9 +460,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; + get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -468,6 +470,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); + + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -775,7 +780,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; } @@ -1842,8 +1846,7 @@ fail_putback: } mlock_migrate_page(new_page, page); - set_page_memcg(new_page, page_memcg(page)); - set_page_memcg(page, NULL); + mem_cgroup_migrate(page, new_page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); diff --git a/mm/shmem.c b/mm/shmem.c index 440e2a7..1acfdbc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v0.10.2 From 62cccb8c8e7a3ca233f49d5e7dcb1557d25465cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:22 -0700 Subject: mm: simplify lock_page_memcg() Now that migration doesn't clear page->mem_cgroup of live pages anymore, it's safe to make lock_page_memcg() and the memcg stat functions take pages, and spare the callers from memcg objects. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Johannes Weiner Suggested-by: Vladimir Davydov Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index dc99151..33be296 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -624,14 +624,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * The caller must hold lock_page_memcg(). */ static void __set_page_dirty(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, int warn) + int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; - struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -686,14 +685,14 @@ int __set_page_dirty_buffers(struct page *page) * Lock out page->mem_cgroup migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - memcg = lock_page_memcg(page); + lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, memcg, 1); + __set_page_dirty(page, mapping, 1); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; struct address_space *mapping = NULL; - struct mem_cgroup *memcg; - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, memcg, 0); + __set_page_dirty(page, mapping, 0); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f85ebc..5c57b7b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1957,7 +1957,6 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; - struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1981,7 +1980,7 @@ xfs_vm_set_page_dirty( * Lock out page->mem_cgroup migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - memcg = lock_page_memcg(page); + lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1992,13 +1991,13 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c45ab3f..d560c9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -455,42 +455,42 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -struct mem_cgroup *lock_page_memcg(struct page *page); -void unlock_page_memcg(struct mem_cgroup *memcg); +void lock_page_memcg(struct page *page); +void unlock_page_memcg(struct page *page); /** * mem_cgroup_update_page_stat - update page state statistics - * @memcg: memcg to account against + * @page: the page * @idx: page state item to account * @val: number of pages (positive or negative) * * Callers must use lock_page_memcg() to prevent double accounting * when the page is concurrently being moved to another memcg: * - * memcg = lock_page_memcg(page); + * lock_page_memcg(page); * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * unlock_page_memcg(memcg); + * mem_cgroup_update_page_stat(page, state, -1); + * unlock_page_memcg(page); */ -static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, +static inline void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int val) { VM_BUG_ON(!rcu_read_lock_held()); - if (memcg) - this_cpu_add(memcg->stat->count[idx], val); + if (page->mem_cgroup) + this_cpu_add(page->mem_cgroup->stat->count[idx], val); } -static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, +static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(memcg, idx, 1); + mem_cgroup_update_page_stat(page, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, +static inline void mem_cgroup_dec_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(memcg, idx, -1); + mem_cgroup_update_page_stat(page, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, @@ -661,12 +661,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *lock_page_memcg(struct page *page) +static inline void lock_page_memcg(struct page *page) { - return NULL; } -static inline void unlock_page_memcg(struct mem_cgroup *memcg) +static inline void unlock_page_memcg(struct page *page) { } @@ -692,12 +691,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, +static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { } -static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, +static inline void mem_cgroup_dec_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b471d1..a862b4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1291,10 +1291,9 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg); +void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb); + struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void cancel_dirty_page(struct page *page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 92395a0..183b15e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -663,8 +663,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg); +extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* diff --git a/mm/filemap.c b/mm/filemap.c index d8317ca..8e629c4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -179,8 +179,7 @@ static void page_cache_tree_delete(struct address_space *mapping, * is safe. The caller must hold the mapping's tree_lock and * lock_page_memcg(). */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -239,8 +238,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -254,7 +252,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -263,11 +260,11 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (freepage) freepage(page); @@ -551,7 +548,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -561,9 +557,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = lock_page_memcg(old); + lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -576,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(old); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64506b2..3e41998 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1690,7 +1690,7 @@ cleanup: * This function protects unlocked LRU pages from being moved to * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *lock_page_memcg(struct page *page) +void lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1699,25 +1699,18 @@ struct mem_cgroup *lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page from being uncharged. - * E.g. end-writeback clearing PageWriteback(), which allows - * migration to go ahead and uncharge the page before the - * account transaction might be complete. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return NULL; + return; if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1733,16 +1726,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return memcg; + return; } EXPORT_SYMBOL(lock_page_memcg); /** * unlock_page_memcg - unlock a page->mem_cgroup binding - * @memcg: the memcg returned by lock_page_memcg() + * @page: the page */ -void unlock_page_memcg(struct mem_cgroup *memcg) +void unlock_page_memcg(struct page *page) { + struct mem_cgroup *memcg = page->mem_cgroup; + if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2b5ea12..d7cf2c5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2414,8 +2414,7 @@ int __set_page_dirty_no_writeback(struct page *page) * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg) +void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2427,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2445,10 +2444,10 @@ EXPORT_SYMBOL(account_page_dirtied); * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb) + struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2469,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, */ int __set_page_dirty_nobuffers(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 1; } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2496,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2626,17 +2623,16 @@ void cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; - memcg = lock_page_memcg(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, memcg, wb); + account_page_cleaned(page, mapping, wb); unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(memcg); + unlock_page_memcg(page); } else { ClearPageDirty(page); } @@ -2667,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; /* @@ -2705,16 +2700,16 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - memcg = lock_page_memcg(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } return TestClearPageDirty(page); @@ -2724,10 +2719,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2751,21 +2745,20 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2793,10 +2786,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 2871e7d..02f0bfc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); } static void page_remove_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page) * pte lock(a spinlock) is held, which implies preemption disabled. */ __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - unlock_page_memcg(memcg); + unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c index 51a24f6..87311af 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg; unsigned long flags; if (page->mapping != mapping) @@ -528,15 +527,15 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +544,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index fd434cc..34f7e2d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -603,12 +603,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; - struct mem_cgroup *memcg; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -648,7 +647,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -674,9 +673,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow, memcg); + __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (freepage != NULL) freepage(page); @@ -686,7 +685,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 14bc23a..6130ba0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -305,10 +305,9 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - struct mem_cgroup *memcg; struct lruvec *lruvec; - memcg = lock_page_memcg(page); + lock_page_memcg(page); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. @@ -316,12 +315,12 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ - if (!mem_cgroup_disabled() && !memcg) + if (!mem_cgroup_disabled() && !page_memcg(page)) goto out; - lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg); + lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); atomic_long_inc(&lruvec->inactive_age); out: - unlock_page_memcg(memcg); + unlock_page_memcg(page); } /* -- cgit v0.10.2 From fdf1cdb91b6ab7a8a91df68c384f36b8a0909cab Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:25 -0700 Subject: mm: remove unnecessary uses of lock_page_memcg() There are several users that nest lock_page_memcg() inside lock_page() to prevent page->mem_cgroup from changing. But the page lock prevents pages from moving between cgroups, so that is unnecessary overhead. Remove lock_page_memcg() in contexts with locked contexts and fix the debug code in the page stat functions to be okay with the page lock. Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d560c9a..f0c4bec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -28,6 +28,7 @@ #include #include #include +#include struct mem_cgroup; struct page; @@ -464,18 +465,19 @@ void unlock_page_memcg(struct page *page); * @idx: page state item to account * @val: number of pages (positive or negative) * - * Callers must use lock_page_memcg() to prevent double accounting - * when the page is concurrently being moved to another memcg: + * The @page must be locked or the caller must use lock_page_memcg() + * to prevent double accounting when the page is concurrently being + * moved to another memcg: * - * lock_page_memcg(page); + * lock_page(page) or lock_page_memcg(page) * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(page, state, -1); - * unlock_page_memcg(page); + * unlock_page(page) or unlock_page_memcg(page) */ static inline void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int val) { - VM_BUG_ON(!rcu_read_lock_held()); + VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page))); if (page->mem_cgroup) this_cpu_add(page->mem_cgroup->stat->count[idx], val); diff --git a/mm/filemap.c b/mm/filemap.c index 8e629c4..61b441b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -176,8 +176,7 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * lock_page_memcg(). + * is safe. The caller must hold the mapping's tree_lock. */ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -260,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (freepage) freepage(page); @@ -557,7 +554,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); @@ -572,7 +568,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(old); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d7cf2c5..11ff8f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2700,7 +2700,6 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); @@ -2709,7 +2708,6 @@ int clear_page_dirty_for_io(struct page *page) ret = 1; } unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(page); return ret; } return TestClearPageDirty(page); diff --git a/mm/truncate.c b/mm/truncate.c index 87311af..7598b55 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -527,7 +527,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; @@ -535,7 +534,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -544,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 34f7e2d..dd98447 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -607,7 +607,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -647,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -675,7 +673,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (freepage != NULL) freepage(page); @@ -685,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); return 0; } -- cgit v0.10.2 From 88193f7ce6657ec4197b1f26b73b37197373b8e6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Mar 2016 14:57:28 -0700 Subject: mm: use linear_page_index() in do_fault() do_fault() assumes that PAGE_SIZE is the same as PAGE_CACHE_SIZE. Use linear_page_index() to calculate pgoff in the correct units. Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 8adb5b7..032f05c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3124,8 +3124,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff_t pgoff = linear_page_index(vma, address); pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ -- cgit v0.10.2 From 8df651c7059e7980f08430d4ebbd134b046657ee Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 15 Mar 2016 14:57:30 -0700 Subject: thp: cleanup split_huge_page() After one of bugfixes to freeze_page(), we don't have freezed pages in rmap, therefore mapcount of all subpages of freezed THP is zero. And we have assert for that. Let's drop code which deal with non-zero mapcount of subpages. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e10a4fe..1ea21e2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). */ - atomic_add(mapcount + 1, &page_tail->_count); - + atomic_inc(&page_tail->_count); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); -- cgit v0.10.2 From 288cf3c64e4522d28349de5345348574cbe9df83 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:57:33 -0700 Subject: x86: query dynamic DEBUG_PAGEALLOC setting We can use debug_pagealloc_enabled() to check if we can map the identity mapping with 2MB pages. We can also add the state into the dump_stack output. The patch does not touch the code for the 1GB pages, which ignored CONFIG_DEBUG_PAGEALLOC. Do we need to fence this as well? Signed-off-by: Christian Borntraeger Reviewed-by: Thomas Gleixner Acked-by: David Rientjes Cc: Laura Abbott Cc: Joonsoo Kim Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acf..32e5699 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,9 +265,8 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_SMP printk("SMP "); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC "); #ifdef CONFIG_KASAN printk("KASAN"); #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 493f541..39823fd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -150,13 +150,14 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { -#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +#if !defined(CONFIG_KMEMCHECK) /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will + * use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (cpu_has_pse) + if (cpu_has_pse && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1c37e65..e64a470 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void) #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -# define debug_pagealloc 1 -#else -# define debug_pagealloc 0 -#endif - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -714,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, { struct page *base; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -1337,10 +1331,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); - if (!debug_pagealloc) + if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) return ret; -- cgit v0.10.2 From 10917b8393521b608f26294b3815261d472f7e9b Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:57:36 -0700 Subject: s390: query dynamic DEBUG_PAGEALLOC setting We can use debug_pagealloc_enabled() to check if we can map the identity mapping with 1MB/2GB pages as well as to print the current setting in dump_stack. Signed-off-by: Christian Borntraeger Reviewed-by: Heiko Carstens Cc: Thomas Gleixner Acked-by: David Rientjes Cc: Laura Abbott Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 02bd02f..14c1ed3 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -184,9 +185,8 @@ void die(struct pt_regs *regs, const char *str) #ifdef CONFIG_SMP printk("SMP "); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC"); printk("\n"); notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV); print_modules(); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index ef7d6c8..d27fccba 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -94,16 +94,15 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pgd_populate(&init_mm, pg_dir, pu_dir); } pu_dir = pud_offset(pg_dir, address); -#ifndef CONFIG_DEBUG_PAGEALLOC if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) { + !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && + !debug_pagealloc_enabled()) { pud_val(*pu_dir) = __pa(address) | _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE | (ro ? _REGION_ENTRY_PROTECT : 0); address += PUD_SIZE; continue; } -#endif if (pud_none(*pu_dir)) { pm_dir = vmem_pmd_alloc(); if (!pm_dir) @@ -111,9 +110,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pud_populate(&init_mm, pu_dir, pm_dir); } pm_dir = pmd_offset(pu_dir, address); -#ifndef CONFIG_DEBUG_PAGEALLOC if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) { + !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && + !debug_pagealloc_enabled()) { pmd_val(*pm_dir) = __pa(address) | _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_YOUNG | @@ -121,7 +120,6 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) address += PMD_SIZE; continue; } -#endif if (pmd_none(*pm_dir)) { pt_dir = vmem_pte_alloc(address); if (!pt_dir) -- cgit v0.10.2 From a75e1f637cf137f82cf025321e7a53adeeed7029 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:57:39 -0700 Subject: x86: also use debug_pagealloc_enabled() for free_init_pages we want to couple all debugging features with debug_pagealloc_enabled() and not with the config option CONFIG_DEBUG_PAGEALLOC. Signed-off-by: Christian Borntraeger Suggested-by: David Rientjes Acked-by: David Rientjes Reviewed-by: Thomas Gleixner Cc: Laura Abbott Cc: Joonsoo Kim Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 39823fd..9d56f27 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -667,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) * mark them not present - any buggy init-section access will * create a kernel page fault: */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", - begin, end - 1); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable and non-executable first. - */ - set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); -#endif + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } } void free_initmem(void) -- cgit v0.10.2 From 5aa174801fc85b4b48a2520b2a50ee78990b7925 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 15 Mar 2016 14:57:42 -0700 Subject: mm/memblock.c: remove unnecessary memblock_type variable We define struct memblock_type *type in the memblock_add_region() and memblock_reserve_region() functions only for passing it to the memlock_add_range() and memblock_reserve_range() functions. Let's remove these variables and will pass a type directly. Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memblock.c b/mm/memblock.c index dd79899..fc7824f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.memory; - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) -- cgit v0.10.2 From 623446e4dc45b37740268165107cc63abb3022f0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:45 -0700 Subject: mm/compaction: fix invalid free_pfn and compact_cached_free_pfn free_pfn and compact_cached_free_pfn are the pointer that remember restart position of freepage scanner. When they are reset or invalid, we set them to zone_end_pfn because freepage scanner works in reverse direction. But, because zone range is defined as [zone_start_pfn, zone_end_pfn), zone_end_pfn is invalid to access. Therefore, we should not store it to free_pfn and compact_cached_free_pfn. Instead, we need to store zone_end_pfn - 1 to them. There is one more thing we should consider. Freepage scanner scan reversely by pageblock unit. If free_pfn and compact_cached_free_pfn are set to middle of pageblock, it regards that sitiation as that it already scans front part of pageblock so we lose opportunity to scan there. To fix-up, this patch do round_down() to guarantee that reset position will be pageblock aligned. Note that thanks to the current pageblock_pfn_to_page() implementation, actual access to zone_end_pfn doesn't happen until now. But, following patch will change pageblock_pfn_to_page() so this patch is needed from now on. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 585de54..56fa321 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -200,7 +200,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -1371,11 +1372,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; -- cgit v0.10.2 From e1409c325fdc1fef7b3d8025c51892355f065d15 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:48 -0700 Subject: mm/compaction: pass only pageblock aligned range to pageblock_pfn_to_page pageblock_pfn_to_page() is used to check there is valid pfn and all pages in the pageblock is in a single zone. If there is a hole in the pageblock, passing arbitrary position to pageblock_pfn_to_page() could cause to skip whole pageblock scanning, instead of just skipping the hole page. For deterministic behaviour, it's better to always pass pageblock aligned range to pageblock_pfn_to_page(). It will also help further optimization on pageblock_pfn_to_page() in the following patch. Signed-off-by: Joonsoo Kim Cc: Aaron Lu Cc: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 56fa321..8ce36eb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -555,13 +555,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -574,11 +578,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -864,18 +870,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1104,7 +1115,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1116,16 +1129,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1136,7 +1154,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1155,8 +1174,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); -- cgit v0.10.2 From 7cf91a98e607c2f935dbcc177d70011e95b8faff Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:51 -0700 Subject: mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous There is a performance drop report due to hugepage allocation and in there half of cpu time are spent on pageblock_pfn_to_page() in compaction [1]. In that workload, compaction is triggered to make hugepage but most of pageblocks are un-available for compaction due to pageblock type and skip bit so compaction usually fails. Most costly operations in this case is to find valid pageblock while scanning whole zone range. To check if pageblock is valid to compact, valid pfn within pageblock is required and we can obtain it by calling pageblock_pfn_to_page(). This function checks whether pageblock is in a single zone and return valid pfn if possible. Problem is that we need to check it every time before scanning pageblock even if we re-visit it and this turns out to be very expensive in this workload. Although we have no way to skip this pageblock check in the system where hole exists at arbitrary position, we can use cached value for zone continuity and just do pfn_to_page() in the system where hole doesn't exist. This optimization considerably speeds up in above workload. Before vs After Max: 1096 MB/s vs 1325 MB/s Min: 635 MB/s 1015 MB/s Avg: 899 MB/s 1194 MB/s Avg is improved by roughly 30% [2]. [1]: http://www.spinics.net/lists/linux-mm/msg97378.html [2]: https://lkml.org/lkml/2015/12/9/23 [akpm@linux-foundation.org: don't forget to restore zone->contiguous on error path, per Vlastimil] Signed-off-by: Joonsoo Kim Reported-by: Aaron Lu Acked-by: Vlastimil Babka Tested-by: Aaron Lu Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06546b3..bb16dfe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -519,13 +519,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void page_alloc_init_late(void); -#else -static inline void page_alloc_init_late(void) -{ -} -#endif /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 769d768..adbef58 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -198,6 +198,9 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 03cbdd9..6de02ac3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -522,6 +522,8 @@ struct zone { bool compact_blockskip_flush; #endif + bool contiguous; + ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; diff --git a/mm/compaction.c b/mm/compaction.c index 8ce36eb..93f71d9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ diff --git a/mm/internal.h b/mm/internal.h index 6636e1d..ad9400d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -132,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) return page_idx ^ (1 << order); } +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 484e867..24ea063 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -512,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; + clear_zone_contiguous(zone); + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -524,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, if (altmap->base_pfn != phys_start_pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - return -EINVAL; + err = -EINVAL; + goto out; } altmap->alloc = 0; } @@ -542,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, err = 0; } vmemmap_populate_print_last(); - +out: + set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -814,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } } + clear_zone_contiguous(zone); + /* * We can only remove entire sections */ @@ -829,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (ret) break; } + + set_zone_contiguous(zone); + return ret; } EXPORT_SYMBOL_GPL(__remove_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 50897dc..c46b75d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1128,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. + */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) @@ -1278,9 +1347,13 @@ free_range: pgdat_init_report_one_done(); return 0; } +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) { + struct zone *zone; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT int nid; /* There will be num_node_state(N_MEMORY) threads */ @@ -1294,8 +1367,11 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); +#endif + + for_each_populated_zone(zone) + set_zone_contiguous(zone); } -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ -- cgit v0.10.2 From 74485cf2bc85d2a10c3653fff4fe956db67ce2a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:54 -0700 Subject: mm: migrate: consolidate mem_cgroup_migrate() calls Rather than scattering mem_cgroup_migrate() calls all over the place, have a single call from a safe place where every migration operation eventually ends up in - migrate_page_copy(). Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Mateusz Guzik Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 848327d..568284e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -331,8 +331,6 @@ int migrate_page_move_mapping(struct address_space *mapping, if (PageSwapBacked(page)) SetPageSwapBacked(newpage); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -428,8 +426,6 @@ int migrate_page_move_mapping(struct address_space *mapping, } local_irq_enable(); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -471,8 +467,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -586,6 +580,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) end_page_writeback(newpage); copy_page_owner(page, newpage); + + mem_cgroup_migrate(page, newpage); } /************************************************************ @@ -1846,7 +1842,6 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); -- cgit v0.10.2 From 9cf7666ace654cc50f815e95f0415e3fcdd6fcfe Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:58 -0700 Subject: mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate() Migration accounting in the memory controller used to have to handle both oldpage and newpage being on the LRU already; fuse's page cache replacement used to pass a recycled newpage that had been uncharged but not freed and removed from the LRU, and the memcg migration code used to uncharge oldpage to "pass on" the existing charge to newpage. Nowadays, pages are no longer uncharged when truncated from the page cache, but rather only at free time, so if a LRU page is recycled in page cache replacement it'll also still be charged. And we bail out of the charge transfer altogether in that case. Tell commit_charge() that we know newpage is not on the LRU, to avoid taking the zone->lru_lock unnecessarily from the migration path. But also, oldpage is no longer uncharged inside migration. We only use oldpage for its page->mem_cgroup and page size, so we don't care about its LRU state anymore either. Remove any mention from the kernel doc. Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Mateusz Guzik Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e41998..42882c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5489,7 +5489,6 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. - * Either or both pages might be on the LRU already. */ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { @@ -5524,7 +5523,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, true); + commit_charge(newpage, memcg, false); local_irq_disable(); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); -- cgit v0.10.2 From 42e152931deba7500e756c39e8e2df2e244860f2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 15 Mar 2016 14:58:01 -0700 Subject: checkpatch: exclude asm volatile from complex macro check asm volatile and all its variants like __asm__ __volatile__ ("") are reported as errors with "Macros with with complex values should be enclosed in parentheses". Make an exception for these asm volatile macro definitions by converting the "asm volatile" to "asm_volatile" so it appears as a single function call and the error isn't reported. Signed-off-by: Joe Perches Reported-by: Jeff Merkey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0147c91..5c00c1c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4560,6 +4560,9 @@ sub process { { } + # Make asm volatile uses seem like a generic function + $dstat =~ s/\b_*asm_*\s+_*volatile_*\b/asm_volatile/g; + my $exceptions = qr{ $Declare| module_param_named| -- cgit v0.10.2 From a1ce18e4f941d2039aa3bdeee17db968919eac2f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 15 Mar 2016 14:58:03 -0700 Subject: checkpatch: warn on bare unsigned or signed declarations without int Kernel style prefers "unsigned int " over "unsigned " and "signed int " over "signed ". Emit a warning for these simple signed/unsigned declarations. Fix it too if desired. Signed-off-by: Joe Perches Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5c00c1c..4b314bd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3239,6 +3239,26 @@ sub process { #ignore lines not being added next if ($line =~ /^[^\+]/); +# check for declarations of signed or unsigned without int + while ($line =~ m{($Declare++)\s*($Ident)\s*[=,;\[\)]}g) { + my $type = $1; + my $var = $2; + if ($type =~ /^((?:un)?signed)((?:\s*\*)*)\s*$/) { + my $sign = $1; + my $pointer = $2; + + $pointer = "" if (!defined $pointer); + + if (WARN("UNSPECIFIED_INT", + "Prefer '" . trim($sign) . " int" . rtrim($pointer) . "' to bare use of '$sign" . rtrim($pointer) . "'\n" . $herecurr) && + $fix) { + my $decl = trim($sign) . " int "; + $decl .= trim($pointer) if (rtrim($pointer) ne ""); + $fixed[$fixlinenr] =~ s@\b\Q$type\E\s*$var\b@$decl$var@; + } + } + } + # TEST: allow direct testing of the type matcher. if ($dbg_type) { if ($line =~ /^.\s*$Declare\s*$/) { -- cgit v0.10.2 From 207a8e8465f91dc5ab12151ee36db3d509a4928a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 15 Mar 2016 14:58:06 -0700 Subject: checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses Improve the test to allow casts to (unsigned) or (signed) to be found and fixed if desired. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4b314bd..b7f44b2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3240,10 +3240,11 @@ sub process { next if ($line =~ /^[^\+]/); # check for declarations of signed or unsigned without int - while ($line =~ m{($Declare++)\s*($Ident)\s*[=,;\[\)]}g) { + while ($line =~ m{($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) { my $type = $1; my $var = $2; - if ($type =~ /^((?:un)?signed)((?:\s*\*)*)\s*$/) { + $var = "" if (!defined $var); + if ($type =~ /^(?:(?:$Storage|$Inline|$Attribute)\s+)*((?:un)?signed)((?:\s*\*)*)\s*$/) { my $sign = $1; my $pointer = $2; @@ -3253,8 +3254,11 @@ sub process { "Prefer '" . trim($sign) . " int" . rtrim($pointer) . "' to bare use of '$sign" . rtrim($pointer) . "'\n" . $herecurr) && $fix) { my $decl = trim($sign) . " int "; - $decl .= trim($pointer) if (rtrim($pointer) ne ""); - $fixed[$fixlinenr] =~ s@\b\Q$type\E\s*$var\b@$decl$var@; + my $comp_pointer = $pointer; + $comp_pointer =~ s/\s//g; + $decl .= $comp_pointer; + $decl = rtrim($decl) if ($var eq ""); + $fixed[$fixlinenr] =~ s@\b$sign\s*\Q$pointer\E\s*$var\b@$decl$var@; } } } -- cgit v0.10.2 From 6b8c69e4384b0bcf1936c2137804840212daaf69 Mon Sep 17 00:00:00 2001 From: "Geyslan G. Bem" Date: Tue, 15 Mar 2016 14:58:09 -0700 Subject: checkpatch: fix another left brace warning This patch escapes a regex that uses left brace. Using checkpatch.pl with Perl 5.22.0 generates the warning: "Unescaped left brace in regex is deprecated, passed through in regex;" Comment from regcomp.c in Perl source: "Currently we don't warn when the lbrace is at the start of a construct. This catches it in the middle of a literal string, or when it's the first thing after something like "\b"." This works as a complement to 4e5d56bd ("checkpatch: fix left brace warning"). Signed-off-by: Geyslan G. Bem Signed-off-by: Joe Perches Suggested-by: Peter Senna Tschudin Cc: Eddie Kovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b7f44b2..c64e604 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4132,7 +4132,7 @@ sub process { ## } #need space before brace following if, while, etc - if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) || + if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) || $line =~ /do\{/) { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && -- cgit v0.10.2 From 4d5d5664c9008c30ade92a56f722223d251883d7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 15 Mar 2016 14:58:12 -0700 Subject: x86: kallsyms: disable absolute percpu symbols on !SMP scripts/kallsyms.c has a special --absolute-percpu command line option which deals with the zero based per cpu offsets that are used when building for SMP on x86_64. This means that the option should only be passed in that case, so add a Kconfig symbol with the correct predicate, and use that instead. Signed-off-by: Ard Biesheuvel Tested-by: Guenter Roeck Reviewed-by: Kees Cook Tested-by: Kees Cook Acked-by: Rusty Russell Cc: Heiko Carstens Cc: Michael Ellerman Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Michal Marek Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/Kconfig b/init/Kconfig index 2232080..b17824a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1420,6 +1420,10 @@ config KALLSYMS_ALL Say N unless you really need all symbols. +config KALLSYMS_ABSOLUTE_PERCPU + bool + default X86_64 && SMP + config PRINTK default y bool "Enable support for printk" if EXPERT diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index ba6c34e..7a08bf9 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -86,7 +86,7 @@ kallsyms() kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET" fi - if [ -n "${CONFIG_X86_64}" ]; then + if [ -n "${CONFIG_KALLSYMS_ABSOLUTE_PERCPU}" ]; then kallsymopt="${kallsymopt} --absolute-percpu" fi -- cgit v0.10.2 From 8c996940b3be9c3ac40ce558c270817e1722a95b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 15 Mar 2016 14:58:15 -0700 Subject: kallsyms: don't overload absolute symbol type for percpu symbols Commit c6bda7c988a5 ("kallsyms: fix percpu vars on x86-64 with relocation") overloaded the 'A' (absolute) symbol type to signify that a symbol is not subject to dynamic relocation. However, the original A type does not imply that at all, and depending on the version of the toolchain, many A type symbols are emitted that are in fact relative to the kernel text, i.e., if the kernel is relocated at runtime, these symbols should be updated as well. For instance, on sparc32, the following symbols are emitted as absolute (kindly provided by Guenter Roeck): f035a420 A _etext f03d9000 A _sdata f03de8c4 A jiffies f03f8860 A _edata f03fc000 A __init_begin f041bdc8 A __init_text_end f0423000 A __bss_start f0423000 A __init_end f044457d A __bss_stop f044457d A _end On x86_64, similar behavior can be observed: ffffffff81a00000 A __end_rodata_hpage_align ffffffff81b19000 A __vvar_page ffffffff81d3d000 A _end Even if only a couple of them pass the symbol range check that results in them to be taken into account for the final kallsyms symbol table, it is obvious that 'A' does not mean the symbol does not need to be updated at relocation time, and overloading its meaning to signify that is perhaps not a good idea. So instead, add a new percpu_absolute member to struct sym_entry, and when --absolute-percpu is in effect, use it to record symbols whose addresses should be emitted as final values rather than values that still require relocation at runtime. That way, we can drop the check against the 'A' type. Signed-off-by: Ard Biesheuvel Tested-by: Guenter Roeck Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Heiko Carstens Cc: Michael Ellerman Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Michal Marek Acked-by: Rusty Russell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 8fa81e8..d39a1ee 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -34,6 +34,7 @@ struct sym_entry { unsigned int len; unsigned int start_pos; unsigned char *sym; + unsigned int percpu_absolute; }; struct addr_range { @@ -171,6 +172,8 @@ static int read_symbol(FILE *in, struct sym_entry *s) strcpy((char *)s->sym + 1, str); s->sym[0] = stype; + s->percpu_absolute = 0; + /* Record if we've found __per_cpu_start/end. */ check_symbol_range(sym, s->addr, &percpu_range, 1); @@ -325,7 +328,7 @@ static int expand_symbol(unsigned char *data, int len, char *result) static int symbol_absolute(struct sym_entry *s) { - return toupper(s->sym[0]) == 'A'; + return s->percpu_absolute; } static void write_src(void) @@ -681,8 +684,15 @@ static void make_percpus_absolute(void) unsigned int i; for (i = 0; i < table_cnt; i++) - if (symbol_in_range(&table[i], &percpu_range, 1)) + if (symbol_in_range(&table[i], &percpu_range, 1)) { + /* + * Keep the 'A' override for percpu symbols to + * ensure consistent behavior compared to older + * versions of this tool. + */ table[i].sym[0] = 'A'; + table[i].percpu_absolute = 1; + } } int main(int argc, char **argv) -- cgit v0.10.2 From 2213e9a66bb87d8344a1256b4ef568220d9587fb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 15 Mar 2016 14:58:19 -0700 Subject: kallsyms: add support for relative offsets in kallsyms address table Similar to how relative extables are implemented, it is possible to emit the kallsyms table in such a way that it contains offsets relative to some anchor point in the kernel image rather than absolute addresses. On 64-bit architectures, it cuts the size of the kallsyms address table in half, since offsets between kernel symbols can typically be expressed in 32 bits. This saves several hundreds of kilobytes of permanent .rodata on average. In addition, the kallsyms address table is no longer subject to dynamic relocation when CONFIG_RELOCATABLE is in effect, so the relocation work done after decompression now doesn't have to do relocation updates for all these values. This saves up to 24 bytes (i.e., the size of a ELF64 RELA relocation table entry) per value, which easily adds up to a couple of megabytes of uncompressed __init data on ppc64 or arm64. Even if these relocation entries typically compress well, the combined size reduction of 2.8 MB uncompressed for a ppc64_defconfig build (of which 2.4 MB is __init data) results in a ~500 KB space saving in the compressed image. Since it is useful for some architectures (like x86) to retain the ability to emit absolute values as well, this patch also adds support for capturing both absolute and relative values when KALLSYMS_ABSOLUTE_PERCPU is in effect, by emitting absolute per-cpu addresses as positive 32-bit values, and addresses relative to the lowest encountered relative symbol as negative values, which are subtracted from the runtime address of this base symbol to produce the actual address. Support for the above is enabled by default for all architectures except IA-64 and Tile-GX, whose symbols are too far apart to capture in this manner. Signed-off-by: Ard Biesheuvel Tested-by: Guenter Roeck Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Heiko Carstens Cc: Michael Ellerman Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Michal Marek Cc: Rusty Russell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/Kconfig b/init/Kconfig index b17824a..fd664b3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1424,6 +1424,24 @@ config KALLSYMS_ABSOLUTE_PERCPU bool default X86_64 && SMP +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 && !(TILE && 64BIT) + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. + config PRINTK default y bool "Enable support for printk" if EXPERT diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f..fafd1a3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index d39a1ee..638b143e 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -43,6 +44,7 @@ struct addr_range { }; static unsigned long long _text; +static unsigned long long relative_base; static struct addr_range text_ranges[] = { { "_stext", "_etext" }, { "_sinittext", "_einittext" }, @@ -62,6 +64,7 @@ static int all_symbols = 0; static int absolute_percpu = 0; static char symbol_prefix_char = '\0'; static unsigned long long kernel_start_addr = 0; +static int base_relative = 0; int token_profit[0x10000]; @@ -75,7 +78,7 @@ static void usage(void) fprintf(stderr, "Usage: kallsyms [--all-symbols] " "[--symbol-prefix=] " "[--page-offset=] " - "< in.map > out.S\n"); + "[--base-relative] < in.map > out.S\n"); exit(1); } @@ -205,6 +208,8 @@ static int symbol_valid(struct sym_entry *s) */ static char *special_symbols[] = { "kallsyms_addresses", + "kallsyms_offsets", + "kallsyms_relative_base", "kallsyms_num_syms", "kallsyms_names", "kallsyms_markers", @@ -349,16 +354,48 @@ static void write_src(void) printf("\t.section .rodata, \"a\"\n"); - /* Provide proper symbols relocatability by their '_text' - * relativeness. The symbol names cannot be used to construct - * normal symbol references as the list of symbols contains - * symbols that are declared static and are private to their - * .o files. This prevents .tmp_kallsyms.o or any other - * object from referencing them. + /* Provide proper symbols relocatability by their relativeness + * to a fixed anchor point in the runtime image, either '_text' + * for absolute address tables, in which case the linker will + * emit the final addresses at build time. Otherwise, use the + * offset relative to the lowest value encountered of all relative + * symbols, and emit non-relocatable fixed offsets that will be fixed + * up at runtime. + * + * The symbol names cannot be used to construct normal symbol + * references as the list of symbols contains symbols that are + * declared static and are private to their .o files. This prevents + * .tmp_kallsyms.o or any other object from referencing them. */ - output_label("kallsyms_addresses"); + if (!base_relative) + output_label("kallsyms_addresses"); + else + output_label("kallsyms_offsets"); + for (i = 0; i < table_cnt; i++) { - if (!symbol_absolute(&table[i])) { + if (base_relative) { + long long offset; + int overflow; + + if (!absolute_percpu) { + offset = table[i].addr - relative_base; + overflow = (offset < 0 || offset > UINT_MAX); + } else if (symbol_absolute(&table[i])) { + offset = table[i].addr; + overflow = (offset < 0 || offset > INT_MAX); + } else { + offset = relative_base - table[i].addr - 1; + overflow = (offset < INT_MIN || offset >= 0); + } + if (overflow) { + fprintf(stderr, "kallsyms failure: " + "%s symbol value %#llx out of range in relative mode\n", + symbol_absolute(&table[i]) ? "absolute" : "relative", + table[i].addr); + exit(EXIT_FAILURE); + } + printf("\t.long\t%#x\n", (int)offset); + } else if (!symbol_absolute(&table[i])) { if (_text <= table[i].addr) printf("\tPTR\t_text + %#llx\n", table[i].addr - _text); @@ -371,6 +408,12 @@ static void write_src(void) } printf("\n"); + if (base_relative) { + output_label("kallsyms_relative_base"); + printf("\tPTR\t_text - %#llx\n", _text - relative_base); + printf("\n"); + } + output_label("kallsyms_num_syms"); printf("\tPTR\t%d\n", table_cnt); printf("\n"); @@ -695,6 +738,18 @@ static void make_percpus_absolute(void) } } +/* find the minimum non-absolute symbol address */ +static void record_relative_base(void) +{ + unsigned int i; + + relative_base = -1ULL; + for (i = 0; i < table_cnt; i++) + if (!symbol_absolute(&table[i]) && + table[i].addr < relative_base) + relative_base = table[i].addr; +} + int main(int argc, char **argv) { if (argc >= 2) { @@ -713,7 +768,9 @@ int main(int argc, char **argv) } else if (strncmp(argv[i], "--page-offset=", 14) == 0) { const char *p = &argv[i][14]; kernel_start_addr = strtoull(p, NULL, 16); - } else + } else if (strcmp(argv[i], "--base-relative") == 0) + base_relative = 1; + else usage(); } } else if (argc != 1) @@ -722,6 +779,8 @@ int main(int argc, char **argv) read_map(stdin); if (absolute_percpu) make_percpus_absolute(); + if (base_relative) + record_relative_base(); sort_symbols(); optimize_token_table(); write_src(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7a08bf9..453ede9 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -90,6 +90,10 @@ kallsyms() kallsymopt="${kallsymopt} --absolute-percpu" fi + if [ -n "${CONFIG_KALLSYMS_BASE_RELATIVE}" ]; then + kallsymopt="${kallsymopt} --base-relative" + fi + local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}" diff --git a/scripts/namespace.pl b/scripts/namespace.pl index a71be6b..9f3c9d4 100755 --- a/scripts/namespace.pl +++ b/scripts/namespace.pl @@ -117,6 +117,8 @@ my %nameexception = ( 'kallsyms_names' => 1, 'kallsyms_num_syms' => 1, 'kallsyms_addresses'=> 1, + 'kallsyms_offsets' => 1, + 'kallsyms_relative_base'=> 1, '__this_module' => 1, '_etext' => 1, '_edata' => 1, -- cgit v0.10.2 From c83aa55d0bb7ec5b07fdf2346af66bc33ba3cb3d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskiy Date: Tue, 15 Mar 2016 14:58:22 -0700 Subject: autofs: show pipe inode in mount options This is required for CRIU (Checkpoint Restart In Userspace) to migrate a mount point when write end in user space is closed. Below is a brief description of the problem. To migrate a non-catatonic autofs mount point, one has to restore the control pipe between kernel and autofs master process. One of the autofs masters is systemd, which closes pipe write end after passing it to the kernel with mount call. To be able to restore the systemd control pipe one has to know which read pipe end in systemd corresponds to the write pipe end in the kernel. The pipe "fd" in mount options is not enough because it was closed and probably replaced by some other descriptor. Thus, some other attribute is required to be able to find the read pipe end. The best attribute to use to find the correct pipe end is inode number becuase it's unique for the whole system and can't be reused while the autofs mount exists. This attribute can also be used to recognize a situation where an autofs mount has no master (no process with specified "pgrp" or no file descriptor with "pipe_ino", specified in autofs mount options). Signed-off-by: Stanislav Kinsburskiy Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a3ae0b2..4320faa 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -94,7 +94,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); - +#ifdef CONFIG_CHECKPOINT_RESTORE + if (sbi->pipe) + seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino); + else + seq_printf(m, ",pipe_ino=-1"); +#endif return 0; } -- cgit v0.10.2 From e9a7c2f1a548f34bcaa7640094201e8b29247940 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:25 -0700 Subject: autofs4: coding style fixes Try and make the coding style completely consistent throughtout the autofs module and inline with kernel coding style recommendations. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c37149b..e50cfae 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -1,15 +1,11 @@ -/* -*- c -*- ------------------------------------------------------------- * - * - * linux/fs/autofs/autofs_i.h - * - * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved - * Copyright 2005-2006 Ian Kent +/* + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ + */ /* Internal header file for autofs */ @@ -35,7 +31,7 @@ #include #include #include -#include +#include /* #define DEBUG */ @@ -51,12 +47,14 @@ printk(KERN_ERR "pid %d: %s: " fmt "\n", \ current->pid, __func__, ##__VA_ARGS__) -/* Unified info structure. This is pointed to by both the dentry and - inode structures. Each file in the filesystem has an instance of this - structure. It holds a reference to the dentry, so dentries are never - flushed while the file exists. All name lookups are dealt with at the - dentry level, although the filesystem can interfere in the validation - process. Readdir is implemented by traversing the dentry lists. */ +/* + * Unified info structure. This is pointed to by both the dentry and + * inode structures. Each file in the filesystem has an instance of this + * structure. It holds a reference to the dentry, so dentries are never + * flushed while the file exists. All name lookups are dealt with at the + * dentry level, although the filesystem can interfere in the validation + * process. Readdir is implemented by traversing the dentry lists. + */ struct autofs_info { struct dentry *dentry; struct inode *inode; @@ -78,7 +76,7 @@ struct autofs_info { kgid_t gid; }; -#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */ #define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered * for expiry, so RCU_walk is * not permitted @@ -140,10 +138,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) } /* autofs4_oz_mode(): do we see the man behind the curtain? (The - processes which do manipulations for us in user space sees the raw - filesystem without "magic".) */ - -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { + * processes which do manipulations for us in user space sees the raw + * filesystem without "magic".) + */ +static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +{ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } @@ -154,12 +153,12 @@ void autofs4_free_ino(struct autofs_info *); int is_autofs4_dentry(struct dentry *); int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); + struct autofs_sb_info *, + struct autofs_packet_expire __user *); int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int when); int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); + struct autofs_sb_info *, int __user *); struct dentry *autofs4_expire_direct(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int how); @@ -224,8 +223,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); +int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) @@ -242,37 +241,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); } - return; } static inline void autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { spin_lock(&sbi->lookup_lock); if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); spin_unlock(&sbi->lookup_lock); } - return; } static inline void autofs4_del_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { spin_lock(&sbi->lookup_lock); if (!list_empty(&ino->expiring)) list_del_init(&ino->expiring); spin_unlock(&sbi->lookup_lock); } - return; } extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index ac7d921..c64b9fa 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -72,8 +72,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; - if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || - (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || + (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { AUTOFS_WARN("ioctl control interface version mismatch: " "kernel(%u.%u), user(%u.%u), cmd(%d)", AUTOFS_DEV_IOCTL_VERSION_MAJOR, @@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) * Copy parameter control struct, including a possible path allocated * at the end of the struct. */ -static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +static struct autofs_dev_ioctl * + copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; @@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); - return; } /* @@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname, void *data) { struct path path; - int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + int err; + + err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); if (err) return err; err = -ENOENT; @@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p) static int test_by_type(struct path *path, void *p) { struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; } @@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry, 0); spin_lock(&sbi->fs_lock); - param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); - param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); + param->requester.uid = + from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = + from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); @@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) } /* ioctl dispatcher */ -static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +static int _autofs_dev_ioctl(unsigned int command, + struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; @@ -711,6 +717,7 @@ out: static long autofs_dev_ioctl(struct file *file, uint command, ulong u) { int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } @@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, - .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); @@ -757,6 +764,5 @@ int __init autofs_dev_ioctl_init(void) void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); - return; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 1cebc3c..8ba37c7 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -1,16 +1,12 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/expire.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2006 Ian Kent +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include "autofs_i.h" @@ -18,7 +14,7 @@ static unsigned long now; /* Check if a dentry can be expired */ static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) + unsigned long timeout, int do_now) { struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -58,7 +54,9 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { - struct autofs_info *ino = autofs4_dentry_ino(top); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; } @@ -74,7 +72,7 @@ done: * Calculate and dget next entry in the subdirs list under root. */ static struct dentry *get_next_positive_subdir(struct dentry *prev, - struct dentry *root) + struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; @@ -121,7 +119,7 @@ cont: * Calculate and dget next entry in top down tree traversal. */ static struct dentry *get_next_positive_dentry(struct dentry *prev, - struct dentry *root) + struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; @@ -187,15 +185,17 @@ again: * autofs submounts. */ static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) + struct dentry *top, + unsigned long timeout, + int do_now) { DPRINTK("top %p %pd", top, top); /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { - struct autofs_info *ino = autofs4_dentry_ino(top); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; @@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt, return 0; } -/* Check a directory tree of mount points for busyness +/* + * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ static int autofs4_tree_busy(struct vfsmount *mnt, @@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry, } else { /* Path walk currently on this dentry? */ struct dentry *expired; + ino_count = atomic_read(&ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; @@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) /* Perform an expiry operation */ int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; struct dentry *dentry; int ret = 0; - memset(&pkt,0,sizeof pkt); + memset(&pkt, 0, sizeof(pkt)); pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) + dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + if (!dentry) return -EAGAIN; pkt.len = dentry->d_name.len; @@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.name[pkt.len] = '\0'; dput(dentry); - if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) + if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; spin_lock(&sbi->fs_lock); @@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_info *ino = autofs4_dentry_ino(dentry); /* This is synchronous because it makes the daemon a - little easier */ + * little easier + */ ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); @@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, return ret; } -/* Call repeatedly until it returns -EAGAIN, meaning there's nothing - more to be done */ +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more to be done. + */ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index b3db517..8cf0e63 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -1,14 +1,10 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/init.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include #include diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 4320faa..ad03705 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -1,15 +1,11 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/inode.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2005-2006 Ian Kent +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include #include @@ -24,7 +20,9 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) { - struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + struct autofs_info *ino; + + ino = kzalloc(sizeof(*ino), GFP_KERNEL); if (ino) { INIT_LIST_HEAD(&ino->active); INIT_LIST_HEAD(&ino->expiring); @@ -152,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -209,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, int autofs4_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; - struct dentry * root; - struct file * pipe; + struct inode *root_inode; + struct dentry *root; + struct file *pipe; int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -222,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - DPRINTK("starting up, sbi = %p",sbi); + DPRINTK("starting up, sbi = %p", sbi); s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c6d7d3d..bdeb883 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -1,16 +1,12 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/root.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2006 Ian Kent +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include #include @@ -23,16 +19,18 @@ #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); -static int autofs4_dir_unlink(struct inode *,struct dentry *); -static int autofs4_dir_rmdir(struct inode *,struct dentry *); -static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); -static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs4_dir_unlink(struct inode *, struct dentry *); +static int autofs4_dir_rmdir(struct inode *, struct dentry *); +static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); +static long autofs4_root_compat_ioctl(struct file *, + unsigned int, unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); +static struct dentry *autofs4_lookup(struct inode *, + struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); @@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = { static void autofs4_add_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry) ino->active_count++; spin_unlock(&sbi->lookup_lock); } - return; } static void autofs4_del_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry) } spin_unlock(&sbi->lookup_lock); } - return; } static int autofs4_dir_open(struct inode *inode, struct file *file) @@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; struct autofs_info *ino; - struct dentry *new = d_lookup(parent, &dentry->d_name); + struct dentry *new; + + new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; ino = autofs4_dentry_ino(new); @@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * a mount-trap. */ struct inode *inode; + if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) return 0; if (d_mountpoint(dentry)) @@ -494,7 +497,8 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +static struct dentry *autofs4_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -513,9 +517,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); - if (active) { + if (active) return active; - } else { + else { /* * A dentry that is not within the root can never trigger a * mount operation, unless the directory already exists, so we @@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ - if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + if (IS_ROOT(dentry->d_parent) && + autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); ino = autofs4_new_ino(sbi); @@ -664,7 +669,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry) if (IS_ROOT(parent->d_parent)) return; managed_dentry_clear_managed(parent); - return; } static void autofs_clear_leaf_automount_flags(struct dentry *dentry) @@ -687,7 +691,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) if (d_child->next == &parent->d_subdirs && d_child->prev == &parent->d_subdirs) managed_dentry_set_managed(parent); - return; } static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) @@ -728,7 +731,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int autofs4_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -768,7 +772,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, - compat_ulong_t __user *p) + compat_ulong_t __user *p) { int rv; unsigned long ntimeout; @@ -787,7 +791,7 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, #endif static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, - unsigned long __user *p) + unsigned long __user *p) { int rv; unsigned long ntimeout; @@ -805,13 +809,15 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) +static inline int autofs4_get_protover(struct autofs_sb_info *sbi, + int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, + int __user *p) { return put_user(sbi->sub_version, p); } @@ -834,9 +840,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) } /* Identify autofs4_dentries - this is so we can tell if there's - an extra dentry refcount or not. We only hold a refcount on the - dentry if its non-negative (ie, d_inode != NULL) -*/ + * an extra dentry refcount or not. We only hold a refcount on the + * dentry if its non-negative (ie, d_inode != NULL) + */ int is_autofs4_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && @@ -855,7 +861,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, void __user *p = (void __user *)arg; DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", - cmd,arg,sbi,task_pgrp_nr(current)); + cmd, arg, sbi, task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) @@ -864,11 +870,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; - switch(cmd) { + switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); + return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); + return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ autofs4_catatonic_mode(sbi); return 0; @@ -888,10 +894,12 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); + return autofs4_expire_run(inode->i_sb, + filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); + return autofs4_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: return -ENOSYS; @@ -902,12 +910,13 @@ static long autofs4_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT static long autofs4_root_compat_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; @@ -916,7 +925,7 @@ static long autofs4_root_compat_ioctl(struct file *filp, ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); else ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, - (unsigned long)compat_ptr(arg)); + (unsigned long) compat_ptr(arg)); return ret; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 84e037d..99aab00 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -1,14 +1,10 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/symlink.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include "autofs_i.h" @@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry, { struct autofs_sb_info *sbi; struct autofs_info *ino; + if (!dentry) return ERR_PTR(-ECHILD); sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35b755e..4e0c8d6 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -1,15 +1,11 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/waitq.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2001-2006 Ian Kent +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include #include @@ -18,7 +14,8 @@ #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it - is better if we don't reassign numbers easily even across filesystems */ + * is better if we don't reassign numbers easily even across filesystems + */ static autofs_wqt_t autofs4_next_wait_queue = 1; /* These are the signals we allow interrupting a pending mount */ @@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi, set_fs(KERNEL_DS); mutex_lock(&sbi->pipe_mutex); - while (bytes && - (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { + wr = __vfs_write(file, data, bytes, &file->f_pos); + while (bytes && wr) { data += wr; bytes -= wr; + wr = __vfs_write(file, data, bytes, &file->f_pos); } mutex_unlock(&sbi->pipe_mutex); set_fs(fs); /* Keep the currently executing process from receiving a - SIGPIPE unless it was already supposed to get one */ + * SIGPIPE unless it was already supposed to get one + */ if (wr == -EPIPE && !sigpipe) { spin_lock_irqsave(¤t->sighand->siglock, flags); sigdelset(¤t->pending.signal, SIGPIPE); @@ -103,9 +102,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); + (unsigned long) wq->wait_queue_token, + wq->name.len, wq->name.name, type); - memset(&pkt,0,sizeof pkt); /* For security reasons */ + memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; @@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } case autofs_ptype_expire_multi: { - struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; + struct autofs_packet_expire_multi *ep = + &pkt.v4_pkt.expire_multi; pktsz = sizeof(*ep); @@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) if (wq->name.hash == qstr->hash && wq->name.len == qstr->len && wq->name.name && - !memcmp(wq->name.name, qstr->name, qstr->len)) + !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; @@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, struct qstr *qstr, - struct dentry*dentry, enum autofs_notify notify) + struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait, * continue on and create a new request. */ if (!IS_ROOT(dentry)) { - if (d_really_is_positive(dentry) && d_unhashed(dentry)) { + if (d_unhashed(dentry) && + d_really_is_positive(dentry)) { struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); if (new) dentry = new; @@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, - enum autofs_notify notify) +int autofs4_wait(struct autofs_sb_info *sbi, + struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; struct qstr qstr; @@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (!wq) { /* Create a new wait queue */ - wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); + wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); @@ -454,7 +457,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); - /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ + /* + * autofs4_notify_daemon() may block; it will unlock ->wq_mutex + */ autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 850f39b..6427816 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -125,7 +125,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; in->size = sizeof(struct autofs_dev_ioctl); in->ioctlfd = -1; - return; } /* diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index fcd704d..b4066bb 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -1,14 +1,10 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * linux/include/linux/auto_fs.h - * - * Copyright 1997 Transmeta Corporation - All Rights Reserved +/* + * Copyright 1997 Transmeta Corporation - All Rights Reserved * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ + */ #ifndef _LINUX_AUTO_FS_H #define _LINUX_AUTO_FS_H diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index bb991df..5fe176a 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -1,7 +1,4 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * linux/include/linux/auto_fs.h - * +/* * Copyright 1997 Transmeta Corporation - All Rights Reserved * * This file is part of the Linux kernel and is made available under @@ -63,12 +60,12 @@ struct autofs_packet_expire { char name[NAME_MAX+1]; }; -#define AUTOFS_IOC_READY _IO(0x93,0x60) -#define AUTOFS_IOC_FAIL _IO(0x93,0x61) -#define AUTOFS_IOC_CATATONIC _IO(0x93,0x62) -#define AUTOFS_IOC_PROTOVER _IOR(0x93,0x63,int) -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t) -#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long) -#define AUTOFS_IOC_EXPIRE _IOR(0x93,0x65,struct autofs_packet_expire) +#define AUTOFS_IOC_READY _IO(0x93, 0x60) +#define AUTOFS_IOC_FAIL _IO(0x93, 0x61) +#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62) +#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int) +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t) +#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long) +#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire) #endif /* _UAPI_LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index e02982f..924fb1a 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -1,6 +1,4 @@ -/* -*- c -*- - * linux/include/linux/auto_fs4.h - * +/* * Copyright 1999-2000 Jeremy Fitzhardinge * * This file is part of the Linux kernel and is made available under @@ -38,7 +36,6 @@ static inline void set_autofs_type_indirect(unsigned int *type) { *type = AUTOFS_TYPE_INDIRECT; - return; } static inline unsigned int autofs_type_indirect(unsigned int type) @@ -49,7 +46,6 @@ static inline unsigned int autofs_type_indirect(unsigned int type) static inline void set_autofs_type_direct(unsigned int *type) { *type = AUTOFS_TYPE_DIRECT; - return; } static inline unsigned int autofs_type_direct(unsigned int type) @@ -60,7 +56,6 @@ static inline unsigned int autofs_type_direct(unsigned int type) static inline void set_autofs_type_offset(unsigned int *type) { *type = AUTOFS_TYPE_OFFSET; - return; } static inline unsigned int autofs_type_offset(unsigned int type) @@ -81,7 +76,6 @@ static inline unsigned int autofs_type_trigger(unsigned int type) static inline void set_autofs_type_any(unsigned int *type) { *type = AUTOFS_TYPE_ANY; - return; } static inline unsigned int autofs_type_any(unsigned int type) @@ -154,11 +148,10 @@ union autofs_v5_packet_union { autofs_packet_expire_direct_t expire_direct; }; -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int) #define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) -#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int) - +#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int) #endif /* _LINUX_AUTO_FS4_H */ -- cgit v0.10.2 From aa330ddc5393acd32b227918650536070e3a9e54 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:28 -0700 Subject: autofs4: fix coding style problem in autofs4_get_set_timeout() Refactor autofs4_get_set_timeout() to eliminate coding style error. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bdeb883..101879ed 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -774,12 +774,16 @@ static int autofs4_dir_mkdir(struct inode *dir, static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, compat_ulong_t __user *p) { - int rv; unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; - if ((rv = get_user(ntimeout, p)) || - (rv = put_user(sbi->exp_timeout/HZ, p))) - return rv; + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; if (ntimeout > UINT_MAX/HZ) sbi->exp_timeout = 0; @@ -787,18 +791,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, sbi->exp_timeout = ntimeout * HZ; return 0; +error: + return rv; } #endif static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { - int rv; unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; - if ((rv = get_user(ntimeout, p)) || - (rv = put_user(sbi->exp_timeout/HZ, p))) - return rv; + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; if (ntimeout > ULONG_MAX/HZ) sbi->exp_timeout = 0; @@ -806,6 +816,8 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, sbi->exp_timeout = ntimeout * HZ; return 0; +error: + return rv; } /* Return protocol version */ -- cgit v0.10.2 From b3f67a988c595e1fd5808008cea098a66cfadb9b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:30 -0700 Subject: autofs4: fix coding style line length in autofs4_wait() The need for this is questionable but checkpatch.pl complains about the line length and it's a straightfoward change. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4e0c8d6..4aeae3b 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -476,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, */ if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ - sigset_t oldset; + unsigned long shutdown_sigs_mask; unsigned long irqflags; + sigset_t oldset; spin_lock_irqsave(¤t->sighand->siglock, irqflags); oldset = current->blocked; - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); + shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; + siginitsetinv(¤t->blocked, shutdown_sigs_mask); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); -- cgit v0.10.2 From e3cd8067c1cd686fed2fe75f6cf98c62fc6ff57e Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:33 -0700 Subject: autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked() The return from an ioctl if an invalid ioctl is passed in should be EINVAL not ENOSYS. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 101879ed..aa8228e 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -914,7 +914,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, filp->f_path.mnt, sbi, p); default: - return -ENOSYS; + return -EINVAL; } } -- cgit v0.10.2 From 0266725ad4ee0f8fcf2ee73be8e68c4adbf2ac79 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:36 -0700 Subject: autofs4: fix some white space errors Fix some white space format errors. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index c64b9fa..b8d0329 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -765,4 +765,3 @@ void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); } - diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index ad03705..7872830 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -327,7 +327,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) */ s->s_root = root; return 0; - + /* * Failure ... clean up. */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index aa8228e..18c3982 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -618,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - + /* This allows root to remove symlinks */ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -698,7 +698,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - + DPRINTK("dentry %p, removing %pd", dentry, dentry); if (!autofs4_oz_mode(sbi)) @@ -878,10 +878,10 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; - + switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4aeae3b..a8a9462 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -88,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -569,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok return 0; } - diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 5fe176a..9175a1b 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -48,7 +48,7 @@ struct autofs_packet_hdr { struct autofs_packet_missing { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_token; int len; char name[NAME_MAX+1]; }; diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 924fb1a..8f8f1bd 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -108,7 +108,7 @@ enum autofs_notify { /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_token; int len; char name[NAME_MAX+1]; }; -- cgit v0.10.2 From cab49f9ed880be69fd8f6a3419347a368ff0ec32 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:39 -0700 Subject: autofs4: make autofs log prints consistent Use the pr_*() print in AUTOFS_*() macros instead of printks and include the module name in log message macros. Also use the AUTOFS_*() macros everywhere instead of raw printks. Signed-off-by: Ian Kent Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index e50cfae..ba6d4eb 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -36,15 +36,15 @@ /* #define DEBUG */ #define DPRINTK(fmt, ...) \ - pr_debug("pid %d: %s: " fmt "\n", \ + pr_debug(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n",\ current->pid, __func__, ##__VA_ARGS__) #define AUTOFS_WARN(fmt, ...) \ - printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + pr_warn(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n", \ current->pid, __func__, ##__VA_ARGS__) #define AUTOFS_ERROR(fmt, ...) \ - printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + pr_err(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n", \ current->pid, __func__, ##__VA_ARGS__) /* diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index b8d0329..1299e7a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -373,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { - AUTOFS_WARN("Not allowed to change PID namespace"); + AUTOFS_WARN("not allowed to change PID namespace"); err = -EINVAL; goto out; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7872830..df06c9a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -270,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { - printk("autofs: called with bogus options\n"); + AUTOFS_ERROR("called with bogus options"); goto fail_dput; } if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { - pr_warn("autofs: could not find process group %d\n", + AUTOFS_ERROR("could not find process group %d", pgrp); goto fail_dput; } @@ -294,8 +294,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - printk("autofs: kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", + AUTOFS_ERROR("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); goto fail_dput; @@ -312,7 +312,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) pipe = fget(pipefd); if (!pipe) { - printk("autofs: could not open pipe file descriptor\n"); + AUTOFS_ERROR("could not open pipe file descriptor"); goto fail_dput; } ret = autofs_prepare_pipe(pipe); @@ -332,7 +332,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Failure ... clean up. */ fail_fput: - printk("autofs: pipe file descriptor does not contain proper ops\n"); + AUTOFS_ERROR("pipe file descriptor does not contain proper ops"); fput(pipe); /* fall through */ fail_dput: diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a8a9462..314230d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -164,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, break; } default: - printk("autofs4_notify_daemon: bad type %d!\n", type); + AUTOFS_WARN("bad type %d!", type); mutex_unlock(&sbi->wq_mutex); return; } @@ -453,7 +453,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, autofs_ptype_expire_indirect; } - DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); -- cgit v0.10.2 From 90967c87e3383c6b8400803ed8e28f2903e279ec Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:42 -0700 Subject: autofs4: change log print macros to not insert newline Common kernel coding practice is to include the newline of log prints within the log text rather than hidden away in a macro. To avoid introducing inconsistencies as changes are made change the log macros to not include the newline. Signed-off-by: Ian Kent Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index ba6d4eb..9a12212 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -36,15 +36,15 @@ /* #define DEBUG */ #define DPRINTK(fmt, ...) \ - pr_debug(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n",\ + pr_debug(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ current->pid, __func__, ##__VA_ARGS__) #define AUTOFS_WARN(fmt, ...) \ - pr_warn(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n", \ + pr_warn(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ current->pid, __func__, ##__VA_ARGS__) #define AUTOFS_ERROR(fmt, ...) \ - pr_err(KBUILD_MODNAME ":pid:%d:%s: " fmt "\n", \ + pr_err(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ current->pid, __func__, ##__VA_ARGS__) /* diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1299e7a..4ccbb01 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { AUTOFS_WARN("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)", + "kernel(%u.%u), user(%u.%u), cmd(%d)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); @@ -130,7 +130,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) err = check_dev_ioctl_version(cmd, param); if (err) { AUTOFS_WARN("invalid device control module version " - "supplied for cmd(0x%08x)", cmd); + "supplied for cmd(0x%08x)\n", cmd); goto out; } @@ -138,14 +138,14 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { AUTOFS_WARN( - "path string terminator missing for cmd(0x%08x)", + "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } err = check_name(param->path); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)\n", cmd); goto out; } @@ -373,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { - AUTOFS_WARN("not allowed to change PID namespace"); + AUTOFS_WARN("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } @@ -661,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, fn = lookup_dev_ioctl(cmd); if (!fn) { - AUTOFS_WARN("unknown command 0x%08x", command); + AUTOFS_WARN("unknown command 0x%08x\n", command); return -ENOTTY; } @@ -754,7 +754,7 @@ int __init autofs_dev_ioctl_init(void) r = misc_register(&_autofs_dev_ioctl_misc); if (r) { - AUTOFS_ERROR("misc_register failed for control device"); + AUTOFS_ERROR("misc_register failed for control device\n"); return r; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 8ba37c7..107a38f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -37,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; - DPRINTK("dentry %p %pd", dentry, dentry); + DPRINTK("dentry %p %pd\n", dentry, dentry); path_get(&path); @@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: - DPRINTK("returning = %d", status); + DPRINTK("returning = %d\n", status); path_put(&path); return status; } @@ -189,7 +189,7 @@ static int autofs4_direct_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { - DPRINTK("top %p %pd", top, top); + DPRINTK("top %p %pd\n", top, top); /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { @@ -220,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *top_ino = autofs4_dentry_ino(top); struct dentry *p; - DPRINTK("top %p %pd", top, top); + DPRINTK("top %p %pd\n", top, top); /* Negative dentry - give up */ if (!simple_positive(top)) @@ -228,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, p = NULL; while ((p = get_next_positive_dentry(p, top))) { - DPRINTK("dentry %p %pd", p, p); + DPRINTK("dentry %p %pd\n", p, p); /* * Is someone visiting anywhere in the subtree ? @@ -274,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, { struct dentry *p; - DPRINTK("parent %p %pd", parent, parent); + DPRINTK("parent %p %pd\n", parent, parent); p = NULL; while ((p = get_next_positive_dentry(p, parent))) { - DPRINTK("dentry %p %pd", p, p); + DPRINTK("dentry %p %pd\n", p, p); if (d_mountpoint(p)) { /* Can we umount this guy */ @@ -363,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry, * offset (autofs-5.0+). */ if (d_mountpoint(dentry)) { - DPRINTK("checking mountpoint %p %pd", dentry, dentry); + DPRINTK("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) @@ -376,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry, } if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { - DPRINTK("checking symlink %p %pd", dentry, dentry); + DPRINTK("checking symlink %p %pd\n", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -473,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, return NULL; found: - DPRINTK("returning %p %pd", expired, expired); + DPRINTK("returning %p %pd\n", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; smp_mb(); ino->flags &= ~AUTOFS_INF_NO_RCU; @@ -505,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); - DPRINTK("waiting for expire %p name=%pd", dentry, dentry); + DPRINTK("waiting for expire %p name=%pd\n", dentry, dentry); status = autofs4_wait(sbi, dentry, NFY_NONE); wait_for_completion(&ino->expire_complete); - DPRINTK("expire done status=%d", status); + DPRINTK("expire done status=%d\n", status); if (d_unhashed(dentry)) return -EAGAIN; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index df06c9a..75f6ef4 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -60,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb) put_pid(sbi->oz_pgrp); } - DPRINTK("shutting down"); + DPRINTK("shutting down\n"); kill_litter_super(sb); if (sbi) kfree_rcu(sbi, rcu); @@ -221,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - DPRINTK("starting up, sbi = %p", sbi); + DPRINTK("starting up, sbi = %p\n", sbi); s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; @@ -270,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { - AUTOFS_ERROR("called with bogus options"); + AUTOFS_ERROR("called with bogus options\n"); goto fail_dput; } if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { - AUTOFS_ERROR("could not find process group %d", + AUTOFS_ERROR("could not find process group %d\n", pgrp); goto fail_dput; } @@ -295,7 +295,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { AUTOFS_ERROR("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)", + "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); goto fail_dput; @@ -308,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); + DPRINTK("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); if (!pipe) { - AUTOFS_ERROR("could not open pipe file descriptor"); + AUTOFS_ERROR("could not open pipe file descriptor\n"); goto fail_dput; } ret = autofs_prepare_pipe(pipe); @@ -332,7 +332,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Failure ... clean up. */ fail_fput: - AUTOFS_ERROR("pipe file descriptor does not contain proper ops"); + AUTOFS_ERROR("pipe file descriptor does not contain proper ops\n"); fput(pipe); /* fall through */ fail_dput: diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 18c3982..a71bcf8 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); + DPRINTK("file=%p dentry=%p %pd\n", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; @@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de) struct autofs_info *ino = autofs4_dentry_ino(de); struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - DPRINTK("releasing %p", de); + DPRINTK("releasing %p\n", de); if (!ino) return; @@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - DPRINTK("waiting for mount name=%pd", dentry); + DPRINTK("waiting for mount name=%pd\n", dentry); status = autofs4_wait(sbi, dentry, NFY_MOUNT); - DPRINTK("mount wait done status=%d", status); + DPRINTK("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; @@ -340,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd", dentry, dentry); + DPRINTK("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ if (autofs4_oz_mode(sbi)) @@ -427,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd", dentry, dentry); + DPRINTK("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { @@ -504,7 +504,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct autofs_info *ino; struct dentry *active; - DPRINTK("name = %pd", dentry); + DPRINTK("name = %pd\n", dentry); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) @@ -512,7 +512,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, sbi = autofs4_sbi(dir->i_sb); - DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", + DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->catatonic, autofs4_oz_mode(sbi)); @@ -559,7 +559,7 @@ static int autofs4_dir_symlink(struct inode *dir, size_t size = strlen(symname); char *cp; - DPRINTK("%s <- %pd", symname, dentry); + DPRINTK("%s <- %pd\n", symname, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - DPRINTK("dentry %p, removing %pd", dentry, dentry); + DPRINTK("dentry %p, removing %pd\n", dentry, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -742,7 +742,7 @@ static int autofs4_dir_mkdir(struct inode *dir, if (!autofs4_oz_mode(sbi)) return -EACCES; - DPRINTK("dentry %p, creating %pd", dentry, dentry); + DPRINTK("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); @@ -844,7 +844,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - DPRINTK("returning %d", status); + DPRINTK("returning %d\n", status); status = put_user(status, p); @@ -872,7 +872,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; - DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", + DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", cmd, arg, sbi, task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 314230d..169ba87 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -31,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) return; } - DPRINTK("entering catatonic mode"); + DPRINTK("entering catatonic mode\n"); sbi->catatonic = 1; wq = sbi->queues; @@ -101,7 +101,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct file *pipe = NULL; size_t pktsz; - DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", + DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); @@ -164,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, break; } default: - AUTOFS_WARN("bad type %d!", type); + AUTOFS_WARN("bad type %d!\n", type); mutex_unlock(&sbi->wq_mutex); return; } @@ -453,7 +453,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, autofs_ptype_expire_indirect; } - DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d", + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); @@ -463,7 +463,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; - DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); @@ -494,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); } else { - DPRINTK("skipped sleeping"); + DPRINTK("skipped sleeping\n"); } status = wq->status; -- cgit v0.10.2 From 8a78d59304102d36a0cd9b4d81e8e48087ffafd2 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:45 -0700 Subject: autofs4: use pr_xxx() macros directly for logging Use the standard pr_xxx() log macros directly for log prints instead of the AUTOFS_XXX() macros. Signed-off-by: Ian Kent Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 9a12212..f0d268b 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -35,17 +35,10 @@ /* #define DEBUG */ -#define DPRINTK(fmt, ...) \ - pr_debug(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ - current->pid, __func__, ##__VA_ARGS__) - -#define AUTOFS_WARN(fmt, ...) \ - pr_warn(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ - current->pid, __func__, ##__VA_ARGS__) - -#define AUTOFS_ERROR(fmt, ...) \ - pr_err(KBUILD_MODNAME ":pid:%d:%s: " fmt, \ - current->pid, __func__, ##__VA_ARGS__) +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__ /* * Unified info structure. This is pointed to by both the dentry and diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 4ccbb01..c7fcc74 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -74,11 +74,11 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { - AUTOFS_WARN("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)\n", - AUTOFS_DEV_IOCTL_VERSION_MAJOR, - AUTOFS_DEV_IOCTL_VERSION_MINOR, - param->ver_major, param->ver_minor, cmd); + pr_warn("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)\n", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); err = -EINVAL; } @@ -129,15 +129,15 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) err = check_dev_ioctl_version(cmd, param); if (err) { - AUTOFS_WARN("invalid device control module version " - "supplied for cmd(0x%08x)\n", cmd); + pr_warn("invalid device control module version " + "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > sizeof(*param)) { err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { - AUTOFS_WARN( + pr_warn( "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; @@ -145,8 +145,8 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) err = check_name(param->path); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)\n", - cmd); + pr_warn("invalid path supplied for cmd(0x%08x)\n", + cmd); goto out; } } @@ -373,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { - AUTOFS_WARN("not allowed to change PID namespace\n"); + pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } @@ -661,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, fn = lookup_dev_ioctl(cmd); if (!fn) { - AUTOFS_WARN("unknown command 0x%08x\n", command); + pr_warn("unknown command 0x%08x\n", command); return -ENOTTY; } @@ -754,7 +754,7 @@ int __init autofs_dev_ioctl_init(void) r = misc_register(&_autofs_dev_ioctl_misc); if (r) { - AUTOFS_ERROR("misc_register failed for control device\n"); + pr_err("misc_register failed for control device\n"); return r; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 107a38f..9510d8d 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -37,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; - DPRINTK("dentry %p %pd\n", dentry, dentry); + pr_debug("dentry %p %pd\n", dentry, dentry); path_get(&path); @@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: - DPRINTK("returning = %d\n", status); + pr_debug("returning = %d\n", status); path_put(&path); return status; } @@ -189,7 +189,7 @@ static int autofs4_direct_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { - DPRINTK("top %p %pd\n", top, top); + pr_debug("top %p %pd\n", top, top); /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { @@ -220,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *top_ino = autofs4_dentry_ino(top); struct dentry *p; - DPRINTK("top %p %pd\n", top, top); + pr_debug("top %p %pd\n", top, top); /* Negative dentry - give up */ if (!simple_positive(top)) @@ -228,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, p = NULL; while ((p = get_next_positive_dentry(p, top))) { - DPRINTK("dentry %p %pd\n", p, p); + pr_debug("dentry %p %pd\n", p, p); /* * Is someone visiting anywhere in the subtree ? @@ -274,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, { struct dentry *p; - DPRINTK("parent %p %pd\n", parent, parent); + pr_debug("parent %p %pd\n", parent, parent); p = NULL; while ((p = get_next_positive_dentry(p, parent))) { - DPRINTK("dentry %p %pd\n", p, p); + pr_debug("dentry %p %pd\n", p, p); if (d_mountpoint(p)) { /* Can we umount this guy */ @@ -363,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry, * offset (autofs-5.0+). */ if (d_mountpoint(dentry)) { - DPRINTK("checking mountpoint %p %pd\n", dentry, dentry); + pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) @@ -376,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry, } if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { - DPRINTK("checking symlink %p %pd\n", dentry, dentry); + pr_debug("checking symlink %p %pd\n", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -473,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, return NULL; found: - DPRINTK("returning %p %pd\n", expired, expired); + pr_debug("returning %p %pd\n", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; smp_mb(); ino->flags &= ~AUTOFS_INF_NO_RCU; @@ -505,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); - DPRINTK("waiting for expire %p name=%pd\n", dentry, dentry); + pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); status = autofs4_wait(sbi, dentry, NFY_NONE); wait_for_completion(&ino->expire_complete); - DPRINTK("expire done status=%d\n", status); + pr_debug("expire done status=%d\n", status); if (d_unhashed(dentry)) return -EAGAIN; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 75f6ef4..61b2105 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -60,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb) put_pid(sbi->oz_pgrp); } - DPRINTK("shutting down\n"); + pr_debug("shutting down\n"); kill_litter_super(sb); if (sbi) kfree_rcu(sbi, rcu); @@ -221,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - DPRINTK("starting up, sbi = %p\n", sbi); + pr_debug("starting up, sbi = %p\n", sbi); s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; @@ -270,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { - AUTOFS_ERROR("called with bogus options\n"); + pr_err("called with bogus options\n"); goto fail_dput; } if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { - AUTOFS_ERROR("could not find process group %d\n", + pr_err("could not find process group %d\n", pgrp); goto fail_dput; } @@ -294,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - AUTOFS_ERROR("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + pr_err("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); goto fail_dput; } @@ -308,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); + pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); if (!pipe) { - AUTOFS_ERROR("could not open pipe file descriptor\n"); + pr_err("could not open pipe file descriptor\n"); goto fail_dput; } ret = autofs_prepare_pipe(pipe); @@ -332,7 +332,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Failure ... clean up. */ fail_fput: - AUTOFS_ERROR("pipe file descriptor does not contain proper ops\n"); + pr_err("pipe file descriptor does not contain proper ops\n"); fput(pipe); /* fall through */ fail_dput: diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a71bcf8..9328b58 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - DPRINTK("file=%p dentry=%p %pd\n", file, dentry, dentry); + pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; @@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de) struct autofs_info *ino = autofs4_dentry_ino(de); struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - DPRINTK("releasing %p\n", de); + pr_debug("releasing %p\n", de); if (!ino) return; @@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - DPRINTK("waiting for mount name=%pd\n", dentry); + pr_debug("waiting for mount name=%pd\n", dentry); status = autofs4_wait(sbi, dentry, NFY_MOUNT); - DPRINTK("mount wait done status=%d\n", status); + pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; @@ -340,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd\n", dentry, dentry); + pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ if (autofs4_oz_mode(sbi)) @@ -427,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd\n", dentry, dentry); + pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { @@ -504,7 +504,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct autofs_info *ino; struct dentry *active; - DPRINTK("name = %pd\n", dentry); + pr_debug("name = %pd\n", dentry); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) @@ -512,9 +512,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, sbi = autofs4_sbi(dir->i_sb); - DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", - current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); if (active) @@ -559,7 +559,7 @@ static int autofs4_dir_symlink(struct inode *dir, size_t size = strlen(symname); char *cp; - DPRINTK("%s <- %pd\n", symname, dentry); + pr_debug("%s <- %pd\n", symname, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - DPRINTK("dentry %p, removing %pd\n", dentry, dentry); + pr_debug("dentry %p, removing %pd\n", dentry, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -742,7 +742,7 @@ static int autofs4_dir_mkdir(struct inode *dir, if (!autofs4_oz_mode(sbi)) return -EACCES; - DPRINTK("dentry %p, creating %pd\n", dentry, dentry); + pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); @@ -844,7 +844,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - DPRINTK("returning %d\n", status); + pr_debug("returning %d\n", status); status = put_user(status, p); @@ -872,8 +872,8 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; - DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", - cmd, arg, sbi, task_pgrp_nr(current)); + pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", + cmd, arg, sbi, task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 169ba87..0146d91 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -31,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) return; } - DPRINTK("entering catatonic mode\n"); + pr_debug("entering catatonic mode\n"); sbi->catatonic = 1; wq = sbi->queues; @@ -101,9 +101,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct file *pipe = NULL; size_t pktsz; - DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, - wq->name.len, wq->name.name, type); + pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", + (unsigned long) wq->wait_queue_token, + wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ @@ -164,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, break; } default: - AUTOFS_WARN("bad type %d!\n", type); + pr_warn("bad type %d!\n", type); mutex_unlock(&sbi->wq_mutex); return; } @@ -453,9 +453,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, autofs_ptype_expire_indirect; } - DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); + pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); /* * autofs4_notify_daemon() may block; it will unlock ->wq_mutex @@ -463,9 +463,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; - DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); + pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); } @@ -494,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); } else { - DPRINTK("skipped sleeping\n"); + pr_debug("skipped sleeping\n"); } status = wq->status; -- cgit v0.10.2 From 63c06227a22b098a3849c5c99e836aea161ca0d7 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 15 Mar 2016 14:58:47 -0700 Subject: autofs4: fix string.h include in auto_dev-ioctl.h Since including linux/string.h will now do the right thing remove the conditional check. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 6427816..7caaf29 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -11,12 +11,7 @@ #define _LINUX_AUTO_DEV_IOCTL_H #include - -#ifdef __KERNEL__ #include -#else -#include -#endif /* __KERNEL__ */ #define AUTOFS_DEVICE_NAME "autofs" -- cgit v0.10.2