From 6933599697c96c3213c95f5f1fc7cb6abfd08c54 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:43:43 -0800 Subject: inotify: hide internal kernel bits from fdinfo There was a report that my patch: inotify: actually check for invalid bits in sys_inotify_add_watch() broke CRIU. The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/* to figure out how to rebuild inotify watches and then passes those flags directly back in to the inotify API. One of those flags (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the inotify API. It is used inside the kernel to _implement_ inotify but it is not and has never been part of the API. My patch above ensured that we only allow bits which are part of the API (IN_ALL_EVENTS). This broke CRIU. FS_EVENT_ON_CHILD is really internal to the kernel. It is set _anyway_ on all inotify marks. So, CRIU was really just trying to set a bit that was already set. This patch hides that bit from fdinfo. CRIU will not see the bit, not try to set it, and should work as before. We should not have been exposing this bit in the first place, so this is a good patch independent of the CRIU problem. Signed-off-by: Dave Hansen Reported-by: Andrey Wagin Acked-by: Andrey Vagin Acked-by: Cyrill Gorcunov Acked-by: Eric Paris Cc: Pavel Emelyanov Cc: John McCutchan Cc: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6b6f0d47..fd98e51 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(mark->inode); if (inode) { + /* + * IN_ALL_EVENTS represents all of the mask bits + * that we expose to userspace. There is at + * least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ + u32 mask = mark->mask & IN_ALL_EVENTS; seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); + mask, mark->ignored_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); -- cgit v0.10.2 From d30e2c05a1a231452c273e74851d6b70d516f7f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:43:46 -0800 Subject: inotify: actually check for invalid bits in sys_inotify_add_watch() The comment here says that it is checking for invalid bits. But, the mask is *actually* checking to ensure that _any_ valid bit is set, which is quite different. Without this check, an unexpected bit could get set on an inotify object. Since these bits are also interpreted by the fsnotify/dnotify code, there is the potential for an object to be mishandled inside the kernel. For instance, can we be sure that setting the dnotify flag FS_DN_RENAME on an inotify watch is harmless? Add the actual check which was intended. Retain the existing check that at least some inotify bits are being added to the watch. Plus, this is existing behavior which would be nice to preserve. I did a quick sniff test that inotify still functions and that my 'inotify-tools' package passes 'make check'. 
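The userspace-visible effect of the new check is easy to demonstrate. The following is an illustrative standalone program, not part of the patch; the watched path is arbitrary and 0x08000000 is the kernel-internal FS_EVENT_ON_CHILD value discussed above. On a kernel with this change, a mask carrying that internal bit is rejected with EINVAL, as is an empty mask, while a normal mask keeps working:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init1(0);
	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}

	/* A normal mask keeps working as before. */
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_MODIFY);
	printf("valid mask   -> wd=%d (%s)\n", wd, wd < 0 ? strerror(errno) : "ok");

	/* 0x08000000 is not an inotify API bit; expect wd=-1, errno=EINVAL. */
	wd = inotify_add_watch(fd, "/tmp", IN_MODIFY | 0x08000000);
	printf("internal bit -> wd=%d (%s)\n", wd, wd < 0 ? strerror(errno) : "ok");

	/* A mask with no valid event bits is also rejected. */
	wd = inotify_add_watch(fd, "/tmp", 0);
	printf("empty mask   -> wd=%d (%s)\n", wd, wd < 0 ? strerror(errno) : "ok");
	return 0;
}
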
Signed-off-by: Dave Hansen Cc: John McCutchan Cc: Robert Love Cc: Eric Paris Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5b1e2a4..b8d08d0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; - /* don't allow invalid bits: we don't want flags set */ + /* + * We share a lot of code with fs/dnotify. We also share + * the bit layout between inotify's IN_* and the fsnotify + * FS_*. This check ensures that only the inotify IN_* + * bits get passed in and set in watches/events. + */ + if (unlikely(mask & ~ALL_INOTIFY_BITS)) + return -EINVAL; + /* + * Require at least one valid bit set in the mask. + * Without _something_ set, we would have no events to + * watch for. + */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; -- cgit v0.10.2 From ce4f2fd7eacd84d78530fb97621621ae50d167f1 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 5 Nov 2015 18:43:49 -0800 Subject: logfs: fix build warning fs/logfs/dev_bdev.c: In function '__bdev_writeseg': include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default] (void) (&_min1 == &_min2); \ fs/logfs/dev_bdev.c:84:14: note: in expansion of macro 'min' max_pages = min(nr_pages, BIO_MAX_PAGES); fs/logfs/dev_bdev.c: In function 'do_erase': include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default] (void) (&_min1 == &_min2); \ fs/logfs/dev_bdev.c:174:14: note: in expansion of macro 'min' max_pages = min(nr_pages, BIO_MAX_PAGES); Let's use min_t and mention the type. Signed-off-by: Sudip Mukherjee Cc: Joern Engel Cc: Prasad Joshi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a7fdbd8..a709d80 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); -- cgit v0.10.2 From d162eaad7726e5f10a2b5813bdfac9d55c8eba69 Mon Sep 17 00:00:00 2001 From: "Norton.Zhu" Date: Thu, 5 Nov 2015 18:43:52 -0800 Subject: ocfs2_direct_IO_write() misses ocfs2_is_overwrite() error code If ocfs2_is_overwrite failed, ocfs2_direct_IO_write may still return success to the caller. 
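The bug is a common pattern: the helper's failure is logged but never copied into the value the function actually returns, so the caller sees success. A standalone sketch of the broken and corrected flow (hypothetical names, not the ocfs2 code; the real one-line fix is in the diff below):

#include <stdio.h>

static int check_overwrite(void)
{
	return -22;	/* pretend the helper failed, e.g. with -EINVAL */
}

static int do_write_buggy(void)
{
	int ret = 0;
	int is_overwrite = check_overwrite();

	if (is_overwrite < 0) {
		fprintf(stderr, "error %d\n", is_overwrite);
		/* bug: "ret = is_overwrite;" is missing here */
	}
	return ret;	/* reports 0 (success) despite the failure */
}

static int do_write_fixed(void)
{
	int ret = 0;
	int is_overwrite = check_overwrite();

	if (is_overwrite < 0) {
		fprintf(stderr, "error %d\n", is_overwrite);
		ret = is_overwrite;	/* propagate the error, as the patch does */
	}
	return ret;
}

int main(void)
{
	printf("buggy: %d, fixed: %d\n", do_write_buggy(), do_write_fixed());
	return 0;
}
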
Signed-off-by: Norton.Zhu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b11d9..f04914c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -864,6 +864,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, is_overwrite = ocfs2_is_overwrite(osb, inode, offset); if (is_overwrite < 0) { mlog_errno(is_overwrite); + ret = is_overwrite; ocfs2_inode_unlock(inode, 1); goto clean_orphan; } -- cgit v0.10.2 From 4e357b932a665e62eb0642633057b8d7156ed8db Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 5 Nov 2015 18:43:55 -0800 Subject: ocfs2: fill in the unused portion of the block with zeros by dio_zero_block() A simplified test case is (this case from Ryan): 1) dd if=/dev/zero of=/mnt/hello bs=512 count=1 oflag=direct; 2) truncate /mnt/hello -s 2097152 File 'hello' does not exist before the test. After these commands, file 'hello' should be all zero, but bytes 512~4096 contain random data. Set the bh state to new when getting a new block, so that direct_io_worker()->dio_zero_block() will fill in the unused portion of the block with zeros. Signed-off-by: Yiwen Jiang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f04914c..7f60472 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = -EIO; goto bail; } + set_buffer_new(bh_result); up_write(&OCFS2_I(inode)->ip_alloc_sem); } -- cgit v0.10.2 From 1d1aff8cf367d2216a678c722161784e207965c4 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:43:58 -0800 Subject: ocfs2: improve performance for localalloc Currently cluster allocation always tries to find a victim chain (the chain with the most space), and this may lead to poor performance because of discontiguous allocation in some scenarios. Our test case is block size 4k, cluster size 1M and mount option localalloc=2048 (2G). Since a gd is 32256M (about 31.5G) and a localalloc window is only 2G, creating a 50G file will result in 2G from gd0, 2G from gd1, ... One way to improve performance is to enlarge the localalloc window size (max 31104M), but this will make the end user feel that about 30G is suddenly "missing", and localalloc currently does not support stealing, which means one node cannot use another node's localalloc even if it is in fact unused. So record the last gd used for allocation and continue with that gd if it has enough space for a localalloc window; this makes the allocation as contiguous as possible. Our test results are below (evaluated in IOPS), using iometer running in a VM with a dynamic VHD virtual disk stored on ocfs2. 
IO model Original After Improved(%) 16K60%Write100%Random 703 876 24.59% 8K90%Write100%Random 735 827 12.59% 4K100%Write100%Random 859 915 6.52% 4K100%Read100%Random 2092 2600 24.30% Signed-off-by: Joseph Qi Tested-by: Norton Zhu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d83d260..fc6d25f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { - hint = ocfs2_group_from_res(res); + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) + hint = res->sr_bg_blkno; + else + hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { -- cgit v0.10.2 From 30edc43c7ff0760f6896c37c06a84533546588fa Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:01 -0800 Subject: ocfs2: do not include dio entry in case of orphan scan dio entry will only do truncate in case of ORPHAN_NEED_TRUNCATE. So do not include it when doing normal orphan scan to reduce contention. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff82b28..4bac600 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv { struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; + enum ocfs2_orphan_reco_type orphan_reco_type; }; static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, @@ -2036,6 +2037,12 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (name_len == 2 && !strncmp("..", name, 2)) return 0; + /* do not include dio entry in case of orphan scan */ + if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && + (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN))) + return 0; + /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); @@ -2060,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, static int ocfs2_queue_orphans(struct ocfs2_super *osb, int slot, - struct inode **head) + struct inode **head, + enum ocfs2_orphan_reco_type orphan_reco_type) { int status; struct inode *orphan_dir_inode = NULL; struct ocfs2_orphan_filldir_priv priv = { .ctx.actor = ocfs2_orphan_filldir, .osb = osb, - .head = *head + .head = *head, + .orphan_reco_type = orphan_reco_type }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, @@ -2170,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); - ret = ocfs2_queue_orphans(osb, slot, &inode); + ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); ocfs2_clear_recovering_orphan_dir(osb, slot); /* Error here should be noted, but we want to continue with as diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b7dfac2..1206392 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) -#define OCFS2_DIO_ORPHAN_PREFIX "dio-" -#define 
OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e173329..1155918 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -26,6 +26,9 @@ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); -- cgit v0.10.2 From 93d911fcce259a3f950ee20592beee31b855cd96 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:04 -0800 Subject: ocfs2: only take lock if dio entry when recover orphans We have no need to take inode mutex, rw and inode lock if it is not dio entry when recover orphans. Optimize it by adding a flag OCFS2_INODE_DIO_ORPHAN_ENTRY to ocfs2_inode_info to reduce contention. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431e..aac8b86 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -112,6 +112,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 +/* Entry in orphan dir with 'dio-' prefix */ +#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4bac600..04d6303 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2049,6 +2049,10 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN)) + OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; + /* Skip inodes which are already added to recover list, since dio may * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { @@ -2195,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; - mutex_lock(&inode->i_mutex); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto next; - } - /* - * We need to take and drop the inode lock to - * force read inode from disk. - */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); - goto unlock_rw; - } + if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { + mutex_lock(&inode->i_mutex); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto unlock_mutex; + } + /* + * We need to take and drop the inode lock to + * force read inode from disk. 
+ */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto unlock_rw; + } - di = (struct ocfs2_dinode *)di_bh->b_data; + di = (struct ocfs2_dinode *)di_bh->b_data; - if (inode->i_nlink == 0) { + if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto unlock_inode; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, + di_bh, 0, 0); + if (ret) + mlog_errno(ret); + } +unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; +unlock_rw: + ocfs2_rw_unlock(inode, 1); +unlock_mutex: + mutex_unlock(&inode->i_mutex); + + /* clear dio flag in ocfs2_inode_info */ + oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; + } else { spin_lock(&oi->ip_lock); /* Set the proper information to get us going into * ocfs2_delete_inode. */ @@ -2221,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, spin_unlock(&oi->ip_lock); } - if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && - (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size_read(inode)); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto unlock_inode; - } - - ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); - if (ret) - mlog_errno(ret); - } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ -unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; -unlock_rw: - ocfs2_rw_unlock(inode, 1); -next: - mutex_unlock(&inode->i_mutex); iput(inode); inode = iter; } -- cgit v0.10.2 From 0986fe9b50f425ec81f25a1a85aaf3574b31d801 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:07 -0800 Subject: ocfs2: fix race between mount and delete node/cluster There is a race case between mount and delete node/cluster, which will lead o2hb_thread to malfunctioning dead loop. o2hb_thread { o2nm_depend_this_node(); <<<<<< race window, node may have already been deleted, and then enter the loop, o2hb thread will be malfunctioning because of no configured nodes found. while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { } So check the return value of o2nm_depend_this_node() is needed. If node has been deleted, do not enter the loop and let mount fail. 
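The shape of the fix is a classic flag-and-wake pattern: the thread that fails to pin the node records that fact and wakes the waiter, and the waiter's wait condition is extended so it can bail out instead of waiting for a steady state that will never come. A standalone pthreads analogue of the scheme in the diff below (illustrative names, not the o2cb API); build with cc -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t steady_queue = PTHREAD_COND_INITIALIZER;
static int steady;		/* analogue of hr_steady_iterations reaching 0 */
static int node_deleted;	/* analogue of the new hr_node_deleted flag */

static int depend_this_node(void)
{
	return -1;		/* pretend the node was already deleted */
}

static void *hb_thread(void *arg)
{
	(void)arg;
	if (depend_this_node()) {
		/* flag the failure and wake the waiter, as the patch does */
		pthread_mutex_lock(&lock);
		node_deleted = 1;
		pthread_cond_signal(&steady_queue);
		pthread_mutex_unlock(&lock);
		return NULL;	/* do not enter the heartbeat loop */
	}
	/* ... the heartbeat loop would run here and eventually set 'steady' ... */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, hb_thread, NULL);

	pthread_mutex_lock(&lock);
	while (!steady && !node_deleted)	/* wait for steady OR deleted */
		pthread_cond_wait(&steady_queue, &lock);
	pthread_mutex_unlock(&lock);

	if (node_deleted)
		fprintf(stderr, "pin failed, fail the mount (-EINVAL in the patch)\n");
	pthread_join(t, NULL);
	return 0;
}
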
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index fa15deb..ddddef0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -219,7 +219,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data) set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(&reg->hr_steady_iterations) == 0); + atomic_read(&reg->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(&reg->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; -- cgit v0.10.2 From b1529a41f777a48f95d4af29668b70ffe3360e1b Mon Sep 17 00:00:00 2001 From: alex chen Date: Thu, 5 Nov 2015 18:44:10 -0800 Subject: ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error In ocfs2_mknod_locked if '__ocfs2_mknod_locked' returns an error, we should reclaim the inode successfully claimed above, otherwise, the inode can never be reused. The case is described below: ocfs2_mknod ocfs2_mknod_locked ocfs2_claim_new_inode Successfully claim the inode __ocfs2_mknod_locked ocfs2_journal_access_di Failed because of -ENOMEM or other reasons, and the inode lockres has not been initialized yet. iput(inode) ocfs2_evict_inode ocfs2_delete_inode ocfs2_inode_lock ocfs2_inode_lock_full_nested __ocfs2_cluster_lock Return -EINVAL because the inode lockres has not been initialized. 
So the following operations are not performed ocfs2_wipe_inode ocfs2_remove_inode ocfs2_free_dinode ocfs2_free_suballoc_bits Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 1206392..3b48ac2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -655,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); + if (status < 0) { + u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); + int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, + inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); + if (tmp) + mlog_errno(tmp); + } + + return status; } static int ocfs2_mkdir(struct inode *dir, -- cgit v0.10.2 From 5afc44e2e9678c0808211f1662732b368cc25f76 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:13 -0800 Subject: ocfs2: add uuid to ocfs2 thread name for problem analysis A node can mount multiple ocfs2 volumes. And if thread names are same for each volume/domain, it will bring inconvenience when analyzing problems because we have to identify which volume/domain the messages belong to. Since thread name will be printed to messages, so add volume uuid or dlm name to thread name can benefit problem analysis. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Gang He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6918f30..2ee7fe7 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) int status; unsigned int backoff; unsigned int total_backoff = 0; + char wq_name[O2NM_MAX_NAME_LEN]; BUG_ON(!dlm); @@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); + dlm->dlm_worker = create_singlethread_workqueue(wq_name); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 58eaa5c..9e4f862 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) mlog(0, "starting dlm recovery thread...\n"); dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, - "dlm_reco_thread"); + "dlm_reco-%s", dlm->name); if (IS_ERR(dlm->dlm_reco_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2e5e6d5..c5f6c24 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) { mlog(0, "Starting dlm_thread...\n"); - dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", + dlm->name); if (IS_ERR(dlm->dlm_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_thread_task)); dlm->dlm_thread_task = NULL; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c91103..20276e3 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super 
*osb) } /* launch downconvert thread */ - osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", + osb->uuid_str); if (IS_ERR(osb->dc_task)) { status = PTR_ERR(osb->dc_task); osb->dc_task = NULL; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 04d6303..13534f4 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) /* Launch the commit thread */ if (!local) { osb->commit_task = kthread_run(ocfs2_commit_thread, osb, - "ocfs2cmt"); + "ocfs2cmt-%s", osb->uuid_str); if (IS_ERR(osb->commit_task)) { status = PTR_ERR(osb->commit_task); osb->commit_task = NULL; @@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) goto out; osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, - "ocfs2rec"); + "ocfs2rec-%s", osb->uuid_str); if (IS_ERR(osb->recovery_thread_task)) { mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); osb->recovery_thread_task = NULL; -- cgit v0.10.2 From 262d8a8779e24596ec3452ebb7d0a989de91089a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:16 -0800 Subject: ocfs2: clean up unused variable in ocfs2_duplicate_clusters_by_page() readahead_pages in ocfs2_duplicate_clusters_by_page is defined but not used, so clean it up. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e5d57cd..2521198 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to, readahead_pages; + unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); - readahead_pages = - (ocfs2_cow_contig_clusters(sb) << - OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* -- cgit v0.10.2 From 720abae3d68ae966044497dd9ee5ccf3b16e700c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 5 Nov 2015 18:44:18 -0800 Subject: rcu: force alignment on struct callback_head/rcu_head Make struct callback_head aligned to size of pointer. On most architectures it happens naturally due ABI requirements, but some architectures (like CRIS) have weird ABI and we need to ask it explicitly. The alignment is required to guarantee that bits 0 and 1 of @next will be clear under normal conditions -- as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. This guarantee is important for few reasons: - future call_rcu_lazy() will make use of lower bits in the pointer; - the structure shares storage spacer in struct page with @compound_head, which encode PageTail() in bit 0. The guarantee is needed to avoid false-positive PageTail(). False postive PageTail() caused crash on crisv32[1]. It happend due misaligned task_struct->rcu, which was byte-aligned. [1] http://lkml.kernel.org/r/55FAEA67.9000102@roeck-us.net Signed-off-by: Kirill A. Shutemov Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Paul E. 
McKenney Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/types.h b/include/linux/types.h index c314989..70d8500 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -205,11 +205,25 @@ struct ustat { * struct callback_head - callback structure for use with RCU and task_work * @next: next update requests in a list * @func: actual update function to call after the grace period. + * + * The struct is aligned to size of pointer. On most architectures it happens + * naturally due ABI requirements, but some architectures (like CRIS) have + * weird ABI and we need to ask it explicitly. + * + * The alignment is required to guarantee that bits 0 and 1 of @next will be + * clear under normal conditions -- as long as we use call_rcu(), + * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. + * + * This guarantee is important for few reasons: + * - future call_rcu_lazy() will make use of lower bits in the pointer; + * - the structure shares storage spacer in struct page with @compound_head, + * which encode PageTail() in bit 0. The guarantee is needed to avoid + * false-positive PageTail(). */ struct callback_head { struct callback_head *next; void (*func)(struct callback_head *head); -}; +} __attribute__((aligned(sizeof(void *)))); #define rcu_head callback_head typedef void (*rcu_callback_t)(struct rcu_head *head); -- cgit v0.10.2 From b64787401fd85b66403dd05159a749e333059c0a Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Thu, 5 Nov 2015 18:44:21 -0800 Subject: 9p: do not overwrite return code when locking fails If the remote locking fail, we run a local vfs unlock that should work and return success to userland when we didn't actually lock at all. We need to tell the application that tried to lock that it didn't get it, not that all went well. Signed-off-by: Dominique Martinet Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3abc447..6b74739 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -231,7 +231,8 @@ out_unlock: if (res < 0 && fl->fl_type != F_UNLCK) { fl_type = fl->fl_type; fl->fl_type = F_UNLCK; - res = posix_lock_file_wait(filp, fl); + /* Even if this fails we want to return the remote error */ + posix_lock_file_wait(filp, fl); fl->fl_type = fl_type; } out: -- cgit v0.10.2 From 451637e454f0b41689cd07cdc3fa53388c22890d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:44:24 -0800 Subject: kernel/watchdog.c: is_hardlockup can be boolean Make is_hardlockup return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. 
Signed-off-by: Yaowei Bai Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 64ed1c3..568ba64 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -263,15 +263,15 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static int is_hardlockup(void) +static bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return 1; + return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); - return 0; + return false; } #endif -- cgit v0.10.2 From d283c640cee6472852b95036ddd512c2ba0c1139 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:27 -0800 Subject: watchdog: fix error handling in proc_watchdog_thresh() The original watchdog_park_threads() function that was introduced by commit 81a4beef91ba ("watchdog: introduce watchdog_park_threads() and watchdog_unpark_threads()") takes a very simple approach to handle errors returned by kthread_park(): It attempts to roll back all watchdog threads to the unparked state. However, this may be undesired behaviour from the perspective of the caller which may want to handle errors as appropriate in its specific context. Currently, there are two possible call chains: - watchdog suspend/resume interface lockup_detector_suspend watchdog_park_threads - write to parameters in /proc/sys/kernel proc_watchdog_update watchdog_enable_all_cpus update_watchdog_all_cpus watchdog_park_threads Instead of 'blindly' attempting to unpark the watchdog threads if a kthread_park() call fails, the new approach is to disable the lockup detectors in the above call chains. Failure becomes visible to the user as follows: - error messages from lockup_detector_suspend() or watchdog_enable_all_cpus() - the state that can be read from /proc/sys/kernel/watchdog_enabled - the 'write' system call in the latter call chain returns an error I did not experience kthread_park() failures in practice, I used some instrumentation to fake error returns from kthread_park() in order to test the patches. This patch (of 5): Restore the previous value of watchdog_thresh _and_ sample_period if proc_watchdog_update() returns an error. The variables must be consistent to avoid false positives of the lockup detectors. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 568ba64..284f0e6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -914,13 +914,14 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, goto out; /* - * Update the sample period. - * Restore 'watchdog_thresh' on failure. + * Update the sample period. Restore on failure. */ set_sample_period(); err = proc_watchdog_update(); - if (err) + if (err) { watchdog_thresh = old; + set_sample_period(); + } out: mutex_unlock(&watchdog_proc_mutex); return err; -- cgit v0.10.2 From 58cf690a09987c9a56933df05c0369d691d6224d Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:30 -0800 Subject: watchdog: move watchdog_disable_all_cpus() outside of ifdef Move watchdog_disable_all_cpus() outside of the ifdef so that it is available if CONFIG_SYSCTL is not defined. This is preparation for "watchdog: implement error handling in update_watchdog_all_cpus() and callers". 
Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 284f0e6..f0f8a78 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -347,6 +347,9 @@ static void watchdog_interrupt_count(void) static int watchdog_nmi_enable(unsigned int cpu); static void watchdog_nmi_disable(unsigned int cpu); +static int watchdog_enable_all_cpus(void); +static void watchdog_disable_all_cpus(void); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -756,9 +759,6 @@ static int watchdog_enable_all_cpus(void) return err; } -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { if (watchdog_running) { @@ -767,6 +767,8 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL + /* * Update the run state of the lockup detectors. */ -- cgit v0.10.2 From b43cb43cb85b91d79d9f0719ff581e8cb6dfbb8f Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:33 -0800 Subject: watchdog: implement error handling in update_watchdog_all_cpus() and callers update_watchdog_all_cpus() now passes errors from watchdog_park_threads() up to functions in the call chain. This allows watchdog_enable_all_cpus() and proc_watchdog_update() to handle such errors too. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f0f8a78..704f933 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -731,10 +731,17 @@ void lockup_detector_resume(void) mutex_unlock(&watchdog_proc_mutex); } -static void update_watchdog_all_cpus(void) +static int update_watchdog_all_cpus(void) { - watchdog_park_threads(); + int ret; + + ret = watchdog_park_threads(); + if (ret) + return ret; + watchdog_unpark_threads(); + + return 0; } static int watchdog_enable_all_cpus(void) @@ -753,9 +760,17 @@ static int watchdog_enable_all_cpus(void) * Enable/disable the lockup detectors or * change the sample period 'on the fly'. */ - update_watchdog_all_cpus(); + err = update_watchdog_all_cpus(); + + if (err) { + watchdog_disable_all_cpus(); + pr_err("Failed to update lockup detectors, disabled\n"); + } } + if (err) + watchdog_enabled = 0; + return err; } @@ -851,12 +866,13 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } while (cmpxchg(&watchdog_enabled, old, new) != old); /* - * Update the run state of the lockup detectors. - * Restore 'watchdog_enabled' on failure. + * Update the run state of the lockup detectors. There is _no_ + * need to check the value returned by proc_watchdog_update() + * and to restore the previous value of 'watchdog_enabled' as + * both lockup detectors are disabled if proc_watchdog_update() + * returns an error. */ err = proc_watchdog_update(); - if (err) - watchdog_enabled = old; } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v0.10.2 From c993590c6ae6273681d9fb2a8d26dce03bf9d96c Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:36 -0800 Subject: watchdog: implement error handling in lockup_detector_suspend() lockup_detector_suspend() now handles errors from watchdog_park_threads(). 
Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 704f933..e8b19db 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -707,6 +707,11 @@ int lockup_detector_suspend(void) if (ret == 0) watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } mutex_unlock(&watchdog_proc_mutex); -- cgit v0.10.2 From ee7fed540563b27e1028bec0b509921496c91bf9 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:39 -0800 Subject: watchdog: do not unpark threads in watchdog_park_threads() on error If kthread_park() returns an error, watchdog_park_threads() should not blindly 'roll back' the already parked threads to the unparked state. Instead leave it up to the callers to handle such errors appropriately in their context. For example, it is redundant to unpark the threads if the lockup detectors will soon be disabled by the callers anyway. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Acked-by: Don Zickus Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e8b19db..452e4ed 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -654,6 +654,12 @@ static struct smp_hotplug_thread watchdog_threads = { /* * park all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function returns an error if kthread_park() of a watchdog thread + * fails. In this situation, the watchdog threads of some CPUs can already + * be parked and the watchdog threads of other CPUs can still be runnable. + * Callers are expected to handle this special condition as appropriate in + * their context. */ static int watchdog_park_threads(void) { @@ -665,10 +671,6 @@ static int watchdog_park_threads(void) if (ret) break; } - if (ret) { - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); - } put_online_cpus(); return ret; -- cgit v0.10.2 From 55537871ef666b4153fd1ef8782e4a13fee142cc Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 5 Nov 2015 18:44:41 -0800 Subject: kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup In many cases of hardlockup reports, it's actually not possible to know why it triggered, because the CPU that got stuck is usually waiting on a resource (with IRQs disabled) in posession of some other CPU is holding. IOW, we are often looking at the stacktrace of the victim and not the actual offender. Introduce sysctl / cmdline parameter that makes it possible to have hardlockup detector perform all-CPU backtrace. Signed-off-by: Jiri Kosina Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6263a2d..0231f45 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1269,6 +1269,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: such that (rxsize & ~0x1fffc0) == 0. Default: 1024 + hardlockup_all_cpu_backtrace= + [KNL] Should the hard-lockup detector generate + backtraces on all cpus. + Format: + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. 
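The boot parameter documented above has a runtime counterpart in the sysctl that this patch also adds. A minimal sketch of flipping it from a program, assuming root and a kernel built with CONFIG_HARDLOCKUP_DETECTOR and SMP (echo 1 > /proc/sys/kernel/hardlockup_all_cpu_backtrace from a root shell is equivalent):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* sysctl file added by this patch; write "1" to enable, "0" to disable */
	int fd = open("/proc/sys/kernel/hardlockup_all_cpu_backtrace", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}
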
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6fccb69..af70d15 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -33,6 +33,7 @@ show up in /proc/sys/kernel: - domainname - hostname - hotplug +- hardlockup_all_cpu_backtrace - hung_task_panic - hung_task_check_count - hung_task_timeout_secs @@ -293,6 +294,17 @@ domain names are in general different. For a detailed discussion see the hostname(1) man page. ============================================================== +hardlockup_all_cpu_backtrace: + +This value controls the hard lockup detector behavior when a hard +lockup condition is detected as to whether or not to gather further +debug information. If enabled, arch-specific all-CPU stack dumping +will be initiated. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. +============================================================== hotplug: diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 78488e0..7ec5b86 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -73,6 +73,7 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long *watchdog_cpumask_bits; extern int sysctl_softlockup_all_cpu_backtrace; +extern int sysctl_hardlockup_all_cpu_backtrace; struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 96c856b..1a5faa3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -898,6 +898,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_SMP */ #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 452e4ed..f6b32b8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. 
In these @@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif /* @@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); + struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; -- cgit v0.10.2 From ac1f591249d95372f3a5ab3828d4af5dfbf5efd3 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 5 Nov 2015 18:44:44 -0800 Subject: kernel/watchdog.c: add sysctl knob hardlockup_panic The only way to enable a hardlockup to panic the machine is to set 'nmi_watchdog=panic' on the kernel command line. This makes it awkward for end users and folks who want to run automate tests (like myself). Mimic the softlockup_panic knob and create a /proc/sys/kernel/hardlockup_panic knob. Signed-off-by: Don Zickus Cc: Ulrich Obergfell Acked-by: Jiri Kosina Reviewed-by: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt index 22dd6af..4a6e33e 100644 --- a/Documentation/lockup-watchdogs.txt +++ b/Documentation/lockup-watchdogs.txt @@ -20,8 +20,9 @@ kernel mode for more than 10 seconds (see "Implementation" below for details), without letting other interrupts have a chance to run. Similarly to the softlockup case, the current stack trace is displayed upon detection and the system will stay locked up unless the default -behavior is changed, which can be done through a compile time knob, -"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog" +behavior is changed, which can be done through a sysctl, +'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", +and a kernel parameter, "nmi_watchdog" (see "Documentation/kernel-parameters.txt" for details). 
The panic option can be used in combination with panic_timeout (this diff --git a/include/linux/sched.h b/include/linux/sched.h index c115d61..5423b9c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; +extern unsigned int hardlockup_panic; void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a5faa3..dc6858d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f6b32b8..0a23125 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -112,7 +112,7 @@ static unsigned long soft_lockup_nmi_warn; * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic = +unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; /* -- cgit v0.10.2 From ee89e71eb091d3ef8ca2be8bd4ec77ccfa91334c Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:47 -0800 Subject: kernel/watchdog.c: avoid race between lockup detector suspend/resume and CPU hotplug The lockup detector suspend/resume interface that was introduced by commit 8c073d27d7ad ("watchdog: introduce watchdog_suspend() and watchdog_resume()") does not protect itself against races with CPU hotplug. Hence, theoretically it is possible that a new watchdog thread is started on a hotplugged CPU while the lockup detector is suspended, and the thread could thus interfere unexpectedly with the code that requested to suspend the lockup detector. Avoid the race by calling get_online_cpus() in lockup_detector_suspend() put_online_cpus() in lockup_detector_resume() Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0a23125..7357842 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -719,6 +719,7 @@ int lockup_detector_suspend(void) { int ret = 0; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); /* * Multiple suspend requests can be active in parallel (counted by @@ -759,6 +760,7 @@ void lockup_detector_resume(void) watchdog_unpark_threads(); mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); } static int update_watchdog_all_cpus(void) -- cgit v0.10.2 From 8614ddef82139d08234dbf681188f9bcddae9f03 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:50 -0800 Subject: kernel/watchdog.c: avoid races between /proc handlers and CPU hotplug The handler functions for watchdog parameters in /proc/sys/kernel do not protect themselves against races with CPU hotplug. Hence, theoretically it is possible that a new watchdog thread is started on a hotplugged CPU while a parameter is being modified, and the thread could thus use a parameter value that is 'in transition'. 
For example, if 'watchdog_thresh' is being set to zero (note: this disables the lockup detectors) the thread would erroneously use the value zero as the sample period. To avoid such races and to keep the /proc handler code consistent, call {get|put}_online_cpus() in proc_watchdog_common() {get|put}_online_cpus() in proc_watchdog_thresh() {get|put}_online_cpus() in proc_watchdog_cpumask() Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7357842..13fdda1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -857,6 +857,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -908,6 +909,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -949,6 +951,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -974,6 +977,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -988,6 +992,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); if (watchdog_suspended) { @@ -1015,6 +1020,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } -- cgit v0.10.2 From a2a45b85ec45db4b041ea5d93b21033dbc3cc0fc Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:53 -0800 Subject: kernel/watchdog.c: remove {get|put}_online_cpus() from watchdog_{park|unpark}_threads() watchdog_{park|unpark}_threads() are now called in code paths that protect themselves against CPU hotplug, so {get|put}_online_cpus() calls are redundant and can be removed. Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 13fdda1..84c4744 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -683,33 +683,35 @@ static struct smp_hotplug_thread watchdog_threads = { * be parked and the watchdog threads of other CPUs can still be runnable. * Callers are expected to handle this special condition as appropriate in * their context. + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). */ static int watchdog_park_threads(void) { int cpu, ret = 0; - get_online_cpus(); for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } - put_online_cpus(); return ret; } /* * unpark all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). 
*/ static void watchdog_unpark_threads(void) { int cpu; - get_online_cpus(); for_each_watchdog_cpu(cpu) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); - put_online_cpus(); } /* -- cgit v0.10.2 From 39d2da2161d35de301ec5397ce9103c68b883054 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Thu, 5 Nov 2015 18:44:56 -0800 Subject: kernel/watchdog.c: fix race between proc_watchdog_thresh() and watchdog_timer_fn() Theoretically it is possible that the watchdog timer expires right at the time when a user sets 'watchdog_thresh' to zero (note: this disables the lockup detectors). In this scenario, the is_softlockup() function - which is called by the timer - could produce a false positive. Fix this by checking the current value of 'watchdog_thresh'. Signed-off-by: Ulrich Obergfell Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84c4744..18f34cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -289,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; -- cgit v0.10.2 From fda901241fb89449244537db4fb27b06e491b74f Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 5 Nov 2015 18:44:59 -0800 Subject: slab: convert slab_is_available() to boolean A good candidate to return a boolean result. Signed-off-by: Denis Kirjanov Cc: Christoph Lameter Reviewed-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e37d44..7c82e3b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -111,7 +111,7 @@ struct mem_cgroup; * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); -int slab_is_available(void); +bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, diff --git a/mm/slab_common.c b/mm/slab_common.c index 5ce4fae..113a6fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -692,7 +692,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -int slab_is_available(void) +bool slab_is_available(void) { return slab_state >= UP; } -- cgit v0.10.2 From a744fd17b5233360681ce03e43804406745b680b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:45:02 -0800 Subject: compiler.h: add support for function attribute assume_aligned gcc 4.9 added the function attribute assume_aligned, indicating to the caller that the returned pointer may be assumed to have a certain minimal alignment. This is useful if, for example, the return value is passed to memset(). Add a shorthand macro for that. 
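For a sense of how the shorthand is meant to be used, here is a standalone sketch with a hypothetical allocator wrapper (not kernel code) that mirrors the guarded macro and attaches the hint to a 64-byte-aligned allocator, so gcc 4.9+ may, for instance, skip alignment fixups around the later memset():

#include <stdlib.h>
#include <string.h>

/* Userspace mirror of the kernel macro added by this patch. */
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))
#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
#else
#define __assume_aligned(a, ...)
#endif

/* The declaration carries the hint: the returned pointer is 64-byte aligned. */
static void *alloc64(size_t size) __assume_aligned(64);

static void *alloc64(size_t size)
{
	return aligned_alloc(64, size);	/* C11 allocator; alignment matches the hint */
}

int main(void)
{
	char *buf = alloc64(4096);

	if (!buf)
		return 1;
	/* The optimizer may now assume 64-byte alignment of 'buf' here. */
	memset(buf, 0, 4096);
	free(buf);
	return 0;
}
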
Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 8efb40e..02fa617 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -210,6 +210,23 @@ #define __visible __attribute__((externally_visible)) #endif + +#if GCC_VERSION >= 40900 +/* + * __assume_aligned(n, k): Tell the optimizer that the returned + * pointer can be assumed to be k modulo n. The second argument is + * optional (default 0), so we use a variadic macro to make the + * shorthand. + * + * Beware: Do not apply this to functions which may return + * ERR_PTRs. Also, it is probably unwise to apply it to functions + * returning extra information in the low bits (but in that case the + * compiler should see some alignment anyway, when the return value is + * massaged by 'flags = ptr & 3; ptr &= ~3;'). + */ +#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) +#endif + /* * GCC 'asm goto' miscompiles certain code sequences: * diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 52a459f..4dac103 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -417,6 +417,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __visible #endif +/* + * Assume alignment of return value. + */ +#ifndef __assume_aligned +#define __assume_aligned(a, ...) +#endif + + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -- cgit v0.10.2 From 8748dd5c98f7fe506ccb31a094833401ed120915 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:45:05 -0800 Subject: include/linux/compiler-gcc.h: hide assume_aligned attribute from sparse The patch "slab.h: sprinkle __assume_aligned attributes" causes *tons* of whinges if you do 'make C=2' with sparse 0.5.0: CHECK drivers/media/usb/pwc/pwc-if.c include/linux/slab.h:307:43: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:308:58: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:337:73: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:375:74: error: attribute '__assume_aligned__': unknown attribute include/linux/slab.h:378:80: error: attribute '__assume_aligned__': unknown attribute sparse apparently pretends to be gcc >= 4.9, yet isn't prepared to handle all the function attributes supported by those gccs and complains loudly. So hide the definition of __assume_aligned from it (so that the generic one in compiler.h gets used). Signed-off-by: Rasmus Villemoes Reported-by: Valdis Kletnieks Tested-By: Valdis Kletnieks Cc: Christopher Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 02fa617..0e3110a 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -211,7 +211,7 @@ #endif -#if GCC_VERSION >= 40900 +#if GCC_VERSION >= 40900 && !defined(__CHECKER__) /* * __assume_aligned(n, k): Tell the optimizer that the returned * pointer can be assumed to be k modulo n. 
The second argument is -- cgit v0.10.2 From c9a77a792003ce9d70df8937c8c87aee6e177149 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:08 -0800 Subject: mm/slab_common.c: rename cache create/destroy helpers do_kmem_cache_create(), do_kmem_cache_shutdown(), and do_kmem_cache_release() sound awkward for static helper functions that are not supposed to be used outside slab_common.c. Rename them to create_cache(), shutdown_cache(), and release_caches(), respectively. This patch is a pure cleanup and does not introduce any functional changes. Signed-off-by: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab_common.c b/mm/slab_common.c index 113a6fd..c8d2ed7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static struct kmem_cache * -do_kmem_cache_create(const char *name, size_t object_size, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct mem_cgroup *memcg, struct kmem_cache *root_cache) +static struct kmem_cache *create_cache(const char *name, + size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; int err; @@ -418,9 +418,9 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } - s = do_kmem_cache_create(cache_name, size, size, - calculate_alignment(flags, align, size), - flags, ctor, NULL, NULL); + s = create_cache(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -448,7 +448,7 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int do_kmem_cache_shutdown(struct kmem_cache *s, +static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { if (__kmem_cache_shutdown(s) != 0) { @@ -469,8 +469,7 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, return 0; } -static void do_kmem_cache_release(struct list_head *release, - bool need_rcu_barrier) +static void release_caches(struct list_head *release, bool need_rcu_barrier) { struct kmem_cache *s, *s2; @@ -536,10 +535,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!cache_name) goto out_unlock; - s = do_kmem_cache_create(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, - root_cache->flags, root_cache->ctor, - memcg, root_cache); + s = create_cache(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -615,14 +614,14 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. 
*/ - BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ @@ -655,12 +654,12 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; for_each_memcg_cache_safe(c, c2, s) { - if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + if (shutdown_cache(c, &release, &need_rcu_barrier)) busy = true; } if (!busy) - do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); + shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); @@ -668,7 +667,7 @@ out_unlock: put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v0.10.2 From d60fdcc9e3febde2ebd49fe517e13f428bc12843 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:11 -0800 Subject: mm/slab_common.c: clear pointers to per memcg caches on destroy Currently, we do not clear pointers to per memcg caches in the memcg_params.memcg_caches array when a global cache is destroyed with kmem_cache_destroy. This is fine if the global cache does get destroyed. However, a cache can be left on the list if it still has active objects when kmem_cache_destroy is called (due to a memory leak). If this happens, the entries in the array will point to already freed areas, which is likely to result in data corruption when the cache is reused (via slab merging). Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.h b/mm/slab.h index a3a967d..bf51a8d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); list_for_each_entry(iter, &(root)->memcg_params.list, \ memcg_params.list) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ - memcg_params.list) - static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -265,8 +261,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - for ((void)(iter), (void)(tmp), (void)(root); 0; ) static inline bool is_root_cache(struct kmem_cache *s) { diff --git a/mm/slab_common.c b/mm/slab_common.c index c8d2ed7..ab1f20e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -461,10 +461,6 @@ static int shutdown_cache(struct kmem_cache *s, if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - list_del(&s->memcg_params.list); -#endif list_move(&s->list, release); return 0; } @@ -597,6 +593,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } +static int __shutdown_memcg_cache(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + BUG_ON(is_root_cache(s)); + + if (shutdown_cache(s, release, need_rcu_barrier)) + return -EBUSY; + + list_del(&s->memcg_params.list); + return 0; +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -614,7 +622,7 @@ void 
memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(shutdown_cache(s, &release, &need_rcu_barrier)); + BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); @@ -623,6 +631,68 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) release_caches(&release, need_rcu_barrier); } + +static int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + struct memcg_cache_array *arr; + struct kmem_cache *c, *c2; + LIST_HEAD(busy); + int i; + + BUG_ON(!is_root_cache(s)); + + /* + * First, shutdown active caches, i.e. caches that belong to online + * memory cgroups. + */ + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = arr->entries[i]; + if (!c) + continue; + if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + /* + * The cache still has objects. Move it to a temporary + * list so as not to try to destroy it for a second + * time while iterating over inactive caches below. + */ + list_move(&c->memcg_params.list, &busy); + else + /* + * The cache is empty and will be destroyed soon. Clear + * the pointer to it in the memcg_caches array so that + * it will never be accessed even if the root cache + * stays alive. + */ + arr->entries[i] = NULL; + } + + /* + * Second, shutdown all caches left from memory cgroups that are now + * offline. + */ + list_for_each_entry_safe(c, c2, &s->memcg_params.list, + memcg_params.list) + __shutdown_memcg_cache(c, release, need_rcu_barrier); + + list_splice(&busy, &s->memcg_params.list); + + /* + * A cache being destroyed must be empty. In particular, this means + * that all per memcg caches attached to it must be empty too. + */ + if (!list_empty(&s->memcg_params.list)) + return -EBUSY; + return 0; +} +#else +static inline int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + return 0; +} #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -634,16 +704,13 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; - bool busy = false; + int err; if (unlikely(!s)) return; - BUG_ON(!is_root_cache(s)); - get_online_cpus(); get_online_mems(); @@ -653,12 +720,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_safe(c, c2, s) { - if (shutdown_cache(c, &release, &need_rcu_barrier)) - busy = true; - } - - if (!busy) + err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + if (!err) shutdown_cache(s, &release, &need_rcu_barrier); out_unlock: -- cgit v0.10.2 From cd918c557439c8f0750f64883367aeff264b5fd8 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:45:14 -0800 Subject: mm/slab_common.c: do not warn that cache is busy on destroy more than once Currently, when kmem_cache_destroy() is called for a global cache, we print a warning for each per memcg cache attached to it that has active objects (see shutdown_cache). This is redundant, because it gives no new information and only clutters the log. If a cache being destroyed has active objects, there must be a memory leak in the module that created the cache, and it does not matter if the cache was used by users in memory cgroups or not. 
This patch moves the warning from shutdown_cache(), which is called for shutting down both global and per memcg caches, to kmem_cache_destroy(), so that the warning is only printed once if there are objects left in the cache being destroyed. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab_common.c b/mm/slab_common.c index ab1f20e..fba78e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -451,12 +451,8 @@ EXPORT_SYMBOL(kmem_cache_create); static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; - } if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; @@ -722,8 +718,13 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); if (!err) - shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s, &release, &need_rcu_barrier); + if (err) { + pr_err("kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + } out_unlock: mutex_unlock(&slab_mutex); -- cgit v0.10.2 From 2b10075539d334b4732a07857672972fe971f7df Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:17 -0800 Subject: tools/vm/slabinfo: use getopt no_argument/optional_argument This patchset adds 'extended' slabinfo mode that provides additional information: -- totals summary -- slabs sorted by size -- slabs sorted by loss (waste) The patches also introduces several new slabinfo options to limit the number of slabs reported, sort slabs by loss (waste); and some fixes. Extended output example (slabinfo -X -N 2): Slabcache Totals ---------------- Slabcaches : 91 Aliases : 119->69 Active: 63 Memory used: 199798784 # Loss : 10689376 MRatio: 5% # Objects : 324301 # PartObj: 18151 ORatio: 5% Per Cache Average Min Max Total ---------------------------------------------------------------------------- #Objects 5147 1 89068 324301 #Slabs 199 1 3886 12537 #PartSlab 12 0 240 778 %PartSlab 32% 0% 100% 6% PartObjs 5 0 4569 18151 % PartObj 26% 0% 100% 5% Memory 3171409 8192 127336448 199798784 Used 3001736 160 121429728 189109408 Loss 169672 0 5906720 10689376 Per Object Average Min Max ----------------------------------------------------------- Memory 585 8 8192 User 583 8 8192 Loss 2 0 64 Slabs sorted by size -------------------- Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 69948 1736 127336448 3871/0/15 18 3 0 95 a dentry 89068 288 26058752 3164/0/17 28 1 0 98 a Slabs sorted by loss -------------------- Name Objects Objsize Loss Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 69948 1736 5906720 3871/0/15 18 3 0 95 a inode_cache 11628 864 537472 642/0/4 18 2 0 94 a The last patch in the series addresses Linus' comment from http://marc.info/?l=linux-mm&m=144148518703321&w=2 (well, it's been some time. sorry.) gnuplot script takes the slabinfo records file, where every record is a `slabinfo -X' output. 
So the basic workflow is, for example, as follows: while [ 1 ]; do slabinfo -X -N 2 >> stats; sleep 1; done ^C slabinfo-gnuplot.sh stats The last command will produce 3 png files (and 3 stats files) -- graph of slabinfo totals -- graph of slabs by size -- graph of slabs by loss It's also possible to select a range of records for plotting (a range of collected slabinfo outputs) via `-r 10,100` (for example); and compare totals from several measurements (to visially compare slabs behaviour (10,50 range)) using pre-parsed totals files: slabinfo-gnuplot.sh -r 10,50 -t stats-totals1 .. stats-totals2 This also, technically, supports ktest. Upload new slabinfo to target, collect the stats and give the resulting stats file to slabinfo-gnuplot This patch (of 8): Use getopt constants in `struct option' ->has_arg instead of numerical representations. Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 808d5a9..258ed01 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -1268,23 +1268,23 @@ static void output_slabs(void) } struct option opts[] = { - { "aliases", 0, NULL, 'a' }, - { "activity", 0, NULL, 'A' }, - { "debug", 2, NULL, 'd' }, - { "display-activity", 0, NULL, 'D' }, - { "empty", 0, NULL, 'e' }, - { "first-alias", 0, NULL, 'f' }, - { "help", 0, NULL, 'h' }, - { "inverted", 0, NULL, 'i'}, - { "numa", 0, NULL, 'n' }, - { "ops", 0, NULL, 'o' }, - { "report", 0, NULL, 'r' }, - { "shrink", 0, NULL, 's' }, - { "slabs", 0, NULL, 'l' }, - { "track", 0, NULL, 't'}, - { "validate", 0, NULL, 'v' }, - { "zero", 0, NULL, 'z' }, - { "1ref", 0, NULL, '1'}, + { "aliases", no_argument, NULL, 'a' }, + { "activity", no_argument, NULL, 'A' }, + { "debug", optional_argument, NULL, 'd' }, + { "display-activity", no_argument, NULL, 'D' }, + { "empty", no_argument, NULL, 'e' }, + { "first-alias", no_argument, NULL, 'f' }, + { "help", no_argument, NULL, 'h' }, + { "inverted", no_argument, NULL, 'i'}, + { "numa", no_argument, NULL, 'n' }, + { "ops", no_argument, NULL, 'o' }, + { "report", no_argument, NULL, 'r' }, + { "shrink", no_argument, NULL, 's' }, + { "slabs", no_argument, NULL, 'l' }, + { "track", no_argument, NULL, 't'}, + { "validate", no_argument, NULL, 'v' }, + { "zero", no_argument, NULL, 'z' }, + { "1ref", no_argument, NULL, '1'}, { NULL, 0, NULL, 0 } }; -- cgit v0.10.2 From 4980a9639b4bf4948f4cba60e3e6c45a635b98ec Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:20 -0800 Subject: tools/vm/slabinfo: limit the number of reported slabs Introduce opt "-N|--lines=K" to limit the number of slabs being reported in output_slabs(). 
Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 258ed01..2ef7f0c 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -79,6 +79,7 @@ int sort_active = 0; int set_debug = 0; int show_ops = 0; int show_activity = 0; +int output_lines = -1; /* Debug options */ int sanity = 0; @@ -124,6 +125,7 @@ static void usage(void) "-v|--validate Validate slabs\n" "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" + "-N|--lines=K Show the first K slabs\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -1242,11 +1244,14 @@ static void output_slabs(void) { struct slabinfo *slab; - for (slab = slabinfo; slab < slabinfo + slabs; slab++) { + for (slab = slabinfo; (slab < slabinfo + slabs) && + output_lines != 0; slab++) { if (slab->alias) continue; + if (output_lines != -1) + output_lines--; if (show_numa) slab_numa(slab, 0); @@ -1285,6 +1290,7 @@ struct option opts[] = { { "validate", no_argument, NULL, 'v' }, { "zero", no_argument, NULL, 'z' }, { "1ref", no_argument, NULL, '1'}, + { "lines", required_argument, NULL, 'N'}, { NULL, 0, NULL, 0 } }; @@ -1296,7 +1302,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:", opts, NULL)) != -1) switch (c) { case '1': @@ -1358,7 +1364,13 @@ int main(int argc, char *argv[]) case 'S': sort_size = 1; break; - + case 'N': + if (optarg) { + output_lines = atoi(optarg); + if (output_lines < 1) + output_lines = 1; + } + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); -- cgit v0.10.2 From 2651f6e7fe1c24b0a50691828edd392971d57d4f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:22 -0800 Subject: tools/vm/slabinfo: sort slabs by loss Introduce opt "-L|--sort-loss" to sort and output slabs by loss (waste) in slabcache(). Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 2ef7f0c..2e154f1 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -80,6 +80,7 @@ int set_debug = 0; int show_ops = 0; int show_activity = 0; int output_lines = -1; +int sort_loss; /* Debug options */ int sanity = 0; @@ -126,6 +127,7 @@ static void usage(void) "-z|--zero Include empty slabs\n" "-1|--1ref Single reference\n" "-N|--lines=K Show the first K slabs\n" + "-L|--Loss Sort by loss\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -301,8 +303,9 @@ static void first_line(void) if (show_activity) printf("Name Objects Alloc Free %%Fast Fallb O CmpX UL\n"); else - printf("Name Objects Objsize Space " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); + printf("Name Objects Objsize %s " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", + sort_loss ? 
"Loss" : "Space"); } /* @@ -335,6 +338,11 @@ static unsigned long slab_activity(struct slabinfo *s) s->alloc_slowpath + s->free_slowpath; } +static unsigned long slab_waste(struct slabinfo *s) +{ + return slab_size(s) - s->objects * s->object_size; +} + static void slab_numa(struct slabinfo *s, int mode) { int node; @@ -563,7 +571,10 @@ static void slabcache(struct slabinfo *s) if (show_empty && s->slabs) return; - store_size(size_str, slab_size(s)); + if (sort_loss == 0) + store_size(size_str, slab_size(s)); + else + store_size(size_str, slab_waste(s)); snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, s->partial, s->cpu_slabs); @@ -1013,6 +1024,8 @@ static void sort_slabs(void) result = slab_size(s1) < slab_size(s2); else if (sort_active) result = slab_activity(s1) < slab_activity(s2); + else if (sort_loss) + result = slab_waste(s1) < slab_waste(s2); else result = strcasecmp(s1->name, s2->name); @@ -1291,6 +1304,7 @@ struct option opts[] = { { "zero", no_argument, NULL, 'z' }, { "1ref", no_argument, NULL, '1'}, { "lines", required_argument, NULL, 'N'}, + { "Loss", no_argument, NULL, 'L'}, { NULL, 0, NULL, 0 } }; @@ -1302,7 +1316,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:L", opts, NULL)) != -1) switch (c) { case '1': @@ -1371,6 +1385,9 @@ int main(int argc, char *argv[]) output_lines = 1; } break; + case 'L': + sort_loss = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); -- cgit v0.10.2 From 0d00bf589f0f8976b5bccbb5bd03a497762b5545 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:25 -0800 Subject: tools/vm/slabinfo: fix alternate opts names Fix mismatches between usage() output and real opts[] options. Add missing alternative opt names, e.g., '-S' had no '--Size' opts[] entry, etc. 
Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 2e154f1..de4974d 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -1294,12 +1294,14 @@ struct option opts[] = { { "first-alias", no_argument, NULL, 'f' }, { "help", no_argument, NULL, 'h' }, { "inverted", no_argument, NULL, 'i'}, + { "slabs", no_argument, NULL, 'l' }, { "numa", no_argument, NULL, 'n' }, { "ops", no_argument, NULL, 'o' }, - { "report", no_argument, NULL, 'r' }, { "shrink", no_argument, NULL, 's' }, - { "slabs", no_argument, NULL, 'l' }, - { "track", no_argument, NULL, 't'}, + { "report", no_argument, NULL, 'r' }, + { "Size", no_argument, NULL, 'S'}, + { "tracking", no_argument, NULL, 't'}, + { "Totals", no_argument, NULL, 'T'}, { "validate", no_argument, NULL, 'v' }, { "zero", no_argument, NULL, 'z' }, { "1ref", no_argument, NULL, '1'}, -- cgit v0.10.2 From 016c6cdf3df8b05a23ff7b0e77aa7f4fe83e700d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:28 -0800 Subject: tools/vm/slabinfo: introduce extended totals mode Add "-X|--Xtotals" opt to output extended totals summary, which includes: -- totals summary -- slabs sorted by size -- slabs sorted by loss (waste) Example: ======= slabinfo --X -N 1 Slabcache Totals ---------------- Slabcaches : 91 Aliases : 120->69 Active: 65 Memory used: 568.3M # Loss : 30.4M MRatio: 5% # Objects : 920.1K # PartObj: 161.2K ORatio: 17% Per Cache Average Min Max Total --------------------------------------------------------- #Objects 14.1K 1 227.8K 920.1K #Slabs 533 1 11.7K 34.7K #PartSlab 86 0 4.3K 5.6K %PartSlab 24% 0% 100% 16% PartObjs 17 0 129.3K 161.2K % PartObj 17% 0% 100% 17% Memory 8.7M 8.1K 384.7M 568.3M Used 8.2M 160 366.5M 537.9M Loss 468.8K 0 18.2M 30.4M Per Object Average Min Max --------------------------------------------- Memory 587 8 8.1K User 584 8 8.1K Loss 2 0 64 Slabs sorted by size ---------------------- Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 211142 1736 384.7M 11732/40/10 18 3 0 95 a Slabs sorted by loss ---------------------- Name Objects Objsize Loss Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 211142 1736 18.2M 11732/40/10 18 3 0 95 a Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index de4974d..44b83ce 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -81,6 +81,7 @@ int show_ops = 0; int show_activity = 0; int output_lines = -1; int sort_loss; +int extended_totals; /* Debug options */ int sanity = 0; @@ -128,6 +129,7 @@ static void usage(void) "-1|--1ref Single reference\n" "-N|--lines=K Show the first K slabs\n" "-L|--Loss Sort by loss\n" + "-X|--Xtotals Show extended summary information\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -615,8 +617,7 @@ static void slabcache(struct slabinfo *s) total_free ? 
(s->free_fastpath * 100 / total_free) : 0, s->order_fallback, s->order, s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); - } - else + } else { printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", s->name, s->objects, s->object_size, size_str, dist_str, s->objs_per_slab, s->order, @@ -624,6 +625,7 @@ static void slabcache(struct slabinfo *s) s->slabs ? (s->objects * s->object_size * 100) / (s->slabs * (page_size << s->order)) : 100, flags); + } } /* @@ -1256,15 +1258,16 @@ static void read_slab_dir(void) static void output_slabs(void) { struct slabinfo *slab; + int lines = output_lines; for (slab = slabinfo; (slab < slabinfo + slabs) && - output_lines != 0; slab++) { + lines != 0; slab++) { if (slab->alias) continue; - if (output_lines != -1) - output_lines--; + if (lines != -1) + lines--; if (show_numa) slab_numa(slab, 0); @@ -1285,6 +1288,30 @@ static void output_slabs(void) } } +static void xtotals(void) +{ + totals(); + + link_slabs(); + rename_slabs(); + + printf("\nSlabs sorted by size\n"); + printf("----------------------\n"); + sort_loss = 0; + sort_size = 1; + sort_slabs(); + output_slabs(); + + printf("\nSlabs sorted by loss\n"); + printf("----------------------\n"); + line = 0; + sort_loss = 1; + sort_size = 0; + sort_slabs(); + output_slabs(); + printf("\n"); +} + struct option opts[] = { { "aliases", no_argument, NULL, 'a' }, { "activity", no_argument, NULL, 'A' }, @@ -1307,6 +1334,7 @@ struct option opts[] = { { "1ref", no_argument, NULL, '1'}, { "lines", required_argument, NULL, 'N'}, { "Loss", no_argument, NULL, 'L'}, + { "Xtotals", no_argument, NULL, 'X'}, { NULL, 0, NULL, 0 } }; @@ -1318,7 +1346,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:L", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LX", opts, NULL)) != -1) switch (c) { case '1': @@ -1390,6 +1418,11 @@ int main(int argc, char *argv[]) case 'L': sort_loss = 1; break; + case 'X': + if (output_lines == -1) + output_lines = 1; + extended_totals = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); @@ -1409,12 +1442,13 @@ int main(int argc, char *argv[]) fatal("%s: Invalid pattern '%s' code %d\n", argv[0], pattern_source, err); read_slab_dir(); - if (show_alias) + if (show_alias) { alias(); - else - if (show_totals) + } else if (extended_totals) { + xtotals(); + } else if (show_totals) { totals(); - else { + } else { link_slabs(); rename_slabs(); sort_slabs(); -- cgit v0.10.2 From a8ea0bf1286a3a96466139a828f3d161d19ca015 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:31 -0800 Subject: tools/vm/slabinfo: output sizes in bytes Introduce "-B|--Bytes" opt to disable store_size() dynamic size scaling and report size in bytes instead. This `expands' the interface a bit, it's impossible to use printf("%6s") anymore to output sizes. 
Example: slabinfo -X -N 2 Slabcache Totals ---------------- Slabcaches : 91 Aliases : 119->69 Active: 63 Memory used: 199798784 # Loss : 10689376 MRatio: 5% # Objects : 324301 # PartObj: 18151 ORatio: 5% Per Cache Average Min Max Total ---------------------------------------------------------------------------- #Objects 5147 1 89068 324301 #Slabs 199 1 3886 12537 #PartSlab 12 0 240 778 %PartSlab 32% 0% 100% 6% PartObjs 5 0 4569 18151 % PartObj 26% 0% 100% 5% Memory 3171409 8192 127336448 199798784 Used 3001736 160 121429728 189109408 Loss 169672 0 5906720 10689376 Per Object Average Min Max ----------------------------------------------------------- Memory 585 8 8192 User 583 8 8192 Loss 2 0 64 Slabs sorted by size -------------------- Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 69948 1736 127336448 3871/0/15 18 3 0 95 a dentry 89068 288 26058752 3164/0/17 28 1 0 98 a Slabs sorted by loss -------------------- Name Objects Objsize Loss Slabs/Part/Cpu O/S O %Fr %Ef Flg ext4_inode_cache 69948 1736 5906720 3871/0/15 18 3 0 95 a inode_cache 11628 864 537472 642/0/4 18 2 0 94 a Besides, store_size() does not use powers of two for G/M/K if (value > 1000000000UL) { divisor = 100000000UL; trailer = 'G'; } else if (value > 1000000UL) { divisor = 100000UL; trailer = 'M'; } else if (value > 1000UL) { divisor = 100; trailer = 'K'; } Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 44b83ce..60beb91 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -82,6 +82,7 @@ int show_activity = 0; int output_lines = -1; int sort_loss; int extended_totals; +int show_bytes; /* Debug options */ int sanity = 0; @@ -130,6 +131,7 @@ static void usage(void) "-N|--lines=K Show the first K slabs\n" "-L|--Loss Sort by loss\n" "-X|--Xtotals Show extended summary information\n" + "-B|--Bytes Show size in bytes\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -231,15 +233,17 @@ static int store_size(char *buffer, unsigned long value) char trailer = 0; int n; - if (value > 1000000000UL) { - divisor = 100000000UL; - trailer = 'G'; - } else if (value > 1000000UL) { - divisor = 100000UL; - trailer = 'M'; - } else if (value > 1000UL) { - divisor = 100; - trailer = 'K'; + if (!show_bytes) { + if (value > 1000000000UL) { + divisor = 100000000UL; + trailer = 'G'; + } else if (value > 1000000UL) { + divisor = 100000UL; + trailer = 'M'; + } else if (value > 1000UL) { + divisor = 100; + trailer = 'K'; + } } value /= divisor; @@ -303,11 +307,12 @@ int line = 0; static void first_line(void) { if (show_activity) - printf("Name Objects Alloc Free %%Fast Fallb O CmpX UL\n"); + printf("Name Objects Alloc Free" + " %%Fast Fallb O CmpX UL\n"); else - printf("Name Objects Objsize %s " + printf("Name Objects Objsize %s " "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n", - sort_loss ? "Loss" : "Space"); + sort_loss ? 
" Loss" : "Space"); } /* @@ -516,7 +521,7 @@ static void report(struct slabinfo *s) if (strcmp(s->name, "*") == 0) return; - printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n", + printf("\nSlabcache: %-15s Aliases: %2d Order : %2d Objects: %lu\n", s->name, s->aliases, s->order, s->objects); if (s->hwcache_align) printf("** Hardware cacheline aligned\n"); @@ -618,7 +623,7 @@ static void slabcache(struct slabinfo *s) s->order_fallback, s->order, s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail); } else { - printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", + printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n", s->name, s->objects, s->object_size, size_str, dist_str, s->objs_per_slab, s->order, s->slabs ? (s->partial * 100) / s->slabs : 100, @@ -933,84 +938,88 @@ static void totals(void) printf("Slabcache Totals\n"); printf("----------------\n"); - printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n", + printf("Slabcaches : %15d Aliases : %11d->%-3d Active: %3d\n", slabs, aliases, alias_targets, used_slabs); store_size(b1, total_size);store_size(b2, total_waste); store_size(b3, total_waste * 100 / total_used); - printf("Memory used: %6s # Loss : %6s MRatio:%6s%%\n", b1, b2, b3); + printf("Memory used: %15s # Loss : %15s MRatio:%6s%%\n", b1, b2, b3); store_size(b1, total_objects);store_size(b2, total_partobj); store_size(b3, total_partobj * 100 / total_objects); - printf("# Objects : %6s # PartObj: %6s ORatio:%6s%%\n", b1, b2, b3); + printf("# Objects : %15s # PartObj: %15s ORatio:%6s%%\n", b1, b2, b3); printf("\n"); - printf("Per Cache Average Min Max Total\n"); - printf("---------------------------------------------------------\n"); + printf("Per Cache Average " + "Min Max Total\n"); + printf("---------------------------------------" + "-------------------------------------\n"); store_size(b1, avg_objects);store_size(b2, min_objects); store_size(b3, max_objects);store_size(b4, total_objects); - printf("#Objects %10s %10s %10s %10s\n", + printf("#Objects %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_slabs);store_size(b2, min_slabs); store_size(b3, max_slabs);store_size(b4, total_slabs); - printf("#Slabs %10s %10s %10s %10s\n", + printf("#Slabs %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_partial);store_size(b2, min_partial); store_size(b3, max_partial);store_size(b4, total_partial); - printf("#PartSlab %10s %10s %10s %10s\n", + printf("#PartSlab %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_ppart);store_size(b2, min_ppart); store_size(b3, max_ppart); store_size(b4, total_partial * 100 / total_slabs); - printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n", + printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n", b1, b2, b3, b4); store_size(b1, avg_partobj);store_size(b2, min_partobj); store_size(b3, max_partobj); store_size(b4, total_partobj); - printf("PartObjs %10s %10s %10s %10s\n", + printf("PartObjs %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj); store_size(b3, max_ppartobj); store_size(b4, total_partobj * 100 / total_objects); - printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n", + printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n", b1, b2, b3, b4); store_size(b1, avg_size);store_size(b2, min_size); store_size(b3, max_size);store_size(b4, total_size); - printf("Memory %10s %10s %10s %10s\n", + printf("Memory %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_used);store_size(b2, min_used); store_size(b3, max_used);store_size(b4, total_used); - printf("Used %10s 
%10s %10s %10s\n", + printf("Used %15s %15s %15s %15s\n", b1, b2, b3, b4); store_size(b1, avg_waste);store_size(b2, min_waste); store_size(b3, max_waste);store_size(b4, total_waste); - printf("Loss %10s %10s %10s %10s\n", + printf("Loss %15s %15s %15s %15s\n", b1, b2, b3, b4); printf("\n"); - printf("Per Object Average Min Max\n"); - printf("---------------------------------------------\n"); + printf("Per Object Average " + "Min Max\n"); + printf("---------------------------------------" + "--------------------\n"); store_size(b1, avg_memobj);store_size(b2, min_memobj); store_size(b3, max_memobj); - printf("Memory %10s %10s %10s\n", + printf("Memory %15s %15s %15s\n", b1, b2, b3); store_size(b1, avg_objsize);store_size(b2, min_objsize); store_size(b3, max_objsize); - printf("User %10s %10s %10s\n", + printf("User %15s %15s %15s\n", b1, b2, b3); store_size(b1, avg_objwaste);store_size(b2, min_objwaste); store_size(b3, max_objwaste); - printf("Loss %10s %10s %10s\n", + printf("Loss %15s %15s %15s\n", b1, b2, b3); } @@ -1112,7 +1121,7 @@ static void alias(void) active = a->slab->name; } else - printf("%-20s -> %s\n", a->name, a->slab->name); + printf("%-15s -> %s\n", a->name, a->slab->name); } if (active) printf("\n"); @@ -1296,14 +1305,14 @@ static void xtotals(void) rename_slabs(); printf("\nSlabs sorted by size\n"); - printf("----------------------\n"); + printf("--------------------\n"); sort_loss = 0; sort_size = 1; sort_slabs(); output_slabs(); printf("\nSlabs sorted by loss\n"); - printf("----------------------\n"); + printf("--------------------\n"); line = 0; sort_loss = 1; sort_size = 0; @@ -1335,6 +1344,7 @@ struct option opts[] = { { "lines", required_argument, NULL, 'N'}, { "Loss", no_argument, NULL, 'L'}, { "Xtotals", no_argument, NULL, 'X'}, + { "Bytes", no_argument, NULL, 'B'}, { NULL, 0, NULL, 0 } }; @@ -1346,7 +1356,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LX", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB", opts, NULL)) != -1) switch (c) { case '1': @@ -1422,6 +1432,10 @@ int main(int argc, char *argv[]) if (output_lines == -1) output_lines = 1; extended_totals = 1; + show_bytes = 1; + break; + case 'B': + show_bytes = 1; break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); -- cgit v0.10.2 From 2cee611af8638d80d3abb71f878829a7e998bb44 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:34 -0800 Subject: tools/vm/slabinfo: cosmetic globals cleanup checkpatch.pl complains about globals being explicitly zeroed out: "ERROR: do not initialise globals to 0 or NULL". New globals, introduced in this patch set, have no explicit 0 initialization; clean up the old ones to make it less hairy. 
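A minimal sketch of why the explicit zeroes are redundant, using two variables that already exist in slabinfo.c: file-scope variables without an initializer are placed in .bss and are guaranteed by the C standard to start out as zero, so only non-zero defaults need an initializer.

    int show_empty;        /* implicitly zero, same effect as "= 0" */
    int output_lines = -1; /* non-zero default still needs an explicit initializer */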
Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 60beb91..86e698d 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -53,43 +53,43 @@ struct aliasinfo { struct slabinfo *slab; } aliasinfo[MAX_ALIASES]; -int slabs = 0; -int actual_slabs = 0; -int aliases = 0; -int alias_targets = 0; -int highest_node = 0; +int slabs; +int actual_slabs; +int aliases; +int alias_targets; +int highest_node; char buffer[4096]; -int show_empty = 0; -int show_report = 0; -int show_alias = 0; -int show_slab = 0; +int show_empty; +int show_report; +int show_alias; +int show_slab; int skip_zero = 1; -int show_numa = 0; -int show_track = 0; -int show_first_alias = 0; -int validate = 0; -int shrink = 0; -int show_inverted = 0; -int show_single_ref = 0; -int show_totals = 0; -int sort_size = 0; -int sort_active = 0; -int set_debug = 0; -int show_ops = 0; -int show_activity = 0; +int show_numa; +int show_track; +int show_first_alias; +int validate; +int shrink; +int show_inverted; +int show_single_ref; +int show_totals; +int sort_size; +int sort_active; +int set_debug; +int show_ops; +int show_activity; int output_lines = -1; int sort_loss; int extended_totals; int show_bytes; /* Debug options */ -int sanity = 0; -int redzone = 0; -int poison = 0; -int tracking = 0; -int tracing = 0; +int sanity; +int redzone; +int poison; +int tracking; +int tracing; int page_size; -- cgit v0.10.2 From 4a981abd115d6ade5fe8a07d5ca1d1f987a0c2f7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:37 -0800 Subject: tools/vm/slabinfo: gnuplot slabifo extended stat GNUplot `slabinfo -X' stats, collected, for example, using the following command: while [ 1 ]; do slabinfo -X >> stats; sleep 1; done `slabinfo-gnuplot.sh stats' pre-processes collected records and generate graphs (totals, slabs sorted by size, slabs sorted by size). Graphs can be [individually] regenerate with different samples range and graph width-heigh (-r %d,%d and -s %d,%d options). To visually compare N `totals' graphs: slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals Signed-off-by: Sergey Senozhatsky Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh new file mode 100644 index 0000000..35b0398 --- /dev/null +++ b/tools/vm/slabinfo-gnuplot.sh @@ -0,0 +1,275 @@ +#!/bin/sh + +# Sergey Senozhatsky, 2015 +# sergey.senozhatsky.work@gmail.com +# +# This software is licensed under the terms of the GNU General Public +# License version 2, as published by the Free Software Foundation, and +# may be copied, distributed, and modified under those terms. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + + +# This program is intended to plot a `slabinfo -X' stats, collected, +# for example, using the following command: +# while [ 1 ]; do slabinfo -X >> stats; sleep 1; done +# +# Use `slabinfo-gnuplot.sh stats' to pre-process collected records +# and generate graphs (totals, slabs sorted by size, slabs sorted +# by size). 
+# +# Graphs can be [individually] regenerate with different ranges and +# size (-r %d,%d and -s %d,%d options). +# +# To visually compare N `totals' graphs, do +# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals +# + +min_slab_name_size=11 +xmin=0 +xmax=0 +width=1500 +height=700 +mode=preprocess + +usage() +{ + echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]" + echo "FILEs must contain 'slabinfo -X' samples" + echo "-t - plot totals for FILE(s)" + echo "-l - plot slabs stats for FILE(s)" + echo "-s %d,%d - set image width and height" + echo "-r %d,%d - use data samples from a given range" +} + +check_file_exist() +{ + if [ ! -f "$1" ]; then + echo "File '$1' does not exist" + exit 1 + fi +} + +do_slabs_plotting() +{ + local file=$1 + local out_file + local range="every ::$xmin" + local xtic="" + local xtic_rotate="norotate" + local lines=2000000 + local wc_lines + + check_file_exist "$file" + + out_file=`basename "$file"` + if [ $xmax -ne 0 ]; then + range="$range::$xmax" + lines=$((xmax-xmin)) + fi + + wc_lines=`cat "$file" | wc -l` + if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then + wc_lines=$lines + fi + + if [ "$wc_lines" -lt "$lines" ]; then + lines=$wc_lines + fi + + if [ $((width / lines)) -gt $min_slab_name_size ]; then + xtic=":xtic(1)" + xtic_rotate=90 + fi + +gnuplot -p << EOF +#!/usr/bin/env gnuplot + +set terminal png enhanced size $width,$height large +set output '$out_file.png' +set autoscale xy +set xlabel 'samples' +set ylabel 'bytes' +set style histogram columnstacked title textcolor lt -1 +set style fill solid 0.15 +set xtics rotate $xtic_rotate +set key left above Left title reverse + +plot "$file" $range u 2$xtic title 'SIZE' with boxes,\ + '' $range u 3 title 'LOSS' with boxes +EOF + + if [ $? -eq 0 ]; then + echo "$out_file.png" + fi +} + +do_totals_plotting() +{ + local gnuplot_cmd="" + local range="every ::$xmin" + local file="" + + if [ $xmax -ne 0 ]; then + range="$range::$xmax" + fi + + for i in "${t_files[@]}"; do + check_file_exist "$i" + + file="$file"`basename "$i"` + gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\ + '$i Memory usage' with lines," + gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \ + '$i Loss' with lines," + done + +gnuplot -p << EOF +#!/usr/bin/env gnuplot + +set terminal png enhanced size $width,$height large +set autoscale xy +set output '$file.png' +set xlabel 'samples' +set ylabel 'bytes' +set key left above Left title reverse + +plot $gnuplot_cmd +EOF + + if [ $? -eq 0 ]; then + echo "$file.png" + fi +} + +do_preprocess() +{ + local out + local lines + local in=$1 + + check_file_exist "$in" + + # use only 'TOP' slab (biggest memory usage or loss) + let lines=3 + out=`basename "$in"`"-slabs-by-loss" + `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ + egrep -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + let lines=3 + out=`basename "$in"`"-slabs-by-size" + `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ + egrep -iv '\-\-|Name|Slabs'\ + | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` + if [ $? -eq 0 ]; then + do_slabs_plotting "$out" + fi + + out=`basename "$in"`"-totals" + `cat "$in" | grep "Memory used" |\ + awk '{print $3" "$7}' > "$out"` + if [ $? 
-eq 0 ]; then + t_files[0]=$out + do_totals_plotting + fi +} + +parse_opts() +{ + local opt + + while getopts "tlr::s::h" opt; do + case $opt in + t) + mode=totals + ;; + l) + mode=slabs + ;; + s) + array=(${OPTARG//,/ }) + width=${array[0]} + height=${array[1]} + ;; + r) + array=(${OPTARG//,/ }) + xmin=${array[0]} + xmax=${array[1]} + ;; + h) + usage + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + :) + echo "-$OPTARG requires an argument." >&2 + exit 1 + ;; + esac + done + + return $OPTIND +} + +parse_args() +{ + local idx=0 + local p + + for p in "$@"; do + case $mode in + preprocess) + files[$idx]=$p + idx=$idx+1 + ;; + totals) + t_files[$idx]=$p + idx=$idx+1 + ;; + slabs) + files[$idx]=$p + idx=$idx+1 + ;; + esac + done +} + +parse_opts "$@" +argstart=$? +parse_args "${@:$argstart}" + +if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then + usage + exit 1 +fi + +case $mode in + preprocess) + for i in "${files[@]}"; do + do_preprocess "$i" + done + ;; + totals) + do_totals_plotting + ;; + slabs) + for i in "${files[@]}"; do + do_slabs_plotting "$i" + done + ;; + *) + echo "Unknown mode $mode" >&2 + usage + exit 1 + ;; +esac -- cgit v0.10.2 From 76f8ec712aa94da9fbfc9c318edc89aa1e48006b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Nov 2015 18:45:40 -0800 Subject: Doc/slub: document slabinfo-gnuplot.sh script Add documentation on how to use slabinfo-gnuplot.sh script. Signed-off-by: Sergey Senozhatsky Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index b0c6d1b..699d8ea 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -280,4 +280,63 @@ of other objects. slub_debug=FZ,dentry +Extended slabinfo mode and plotting +----------------------------------- + +The slabinfo tool has a special 'extended' ('-X') mode that includes: + - Slabcache Totals + - Slabs sorted by size (up to -N slabs, default 1) + - Slabs sorted by loss (up to -N slabs, default 1) + +Additionally, in this mode slabinfo does not dynamically scale sizes (G/M/K) +and reports everything in bytes (this functionality is also available to +other slabinfo modes via '-B' option) which makes reporting more precise and +accurate. Moreover, in some sense the `-X' mode also simplifies the analysis +of slabs' behaviour, because its output can be plotted using the +slabinfo-gnuplot.sh script. So it pushes the analysis from looking through +the numbers (tons of numbers) to something easier -- visual analysis. + +To generate plots: +a) collect slabinfo extended records, for example: + + while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done + +b) pass stats file(-s) to slabinfo-gnuplot.sh script: + slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN] + +The slabinfo-gnuplot.sh script will pre-processes the collected records +and generates 3 png files (and 3 pre-processing cache files) per STATS +file: + - Slabcache Totals: FOO_STATS-totals.png + - Slabs sorted by size: FOO_STATS-slabs-by-size.png + - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png + +Another use case, when slabinfo-gnuplot can be useful, is when you need +to compare slabs' behaviour "prior to" and "after" some code modification. +To help you out there, slabinfo-gnuplot.sh script can 'merge' the +`Slabcache Totals` sections from different measurements. To visually +compare N plots: + +a) Collect as many STATS1, STATS2, .. 
STATSN files as you need + while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done + +b) Pre-process those STATS files + slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN + +c) Execute slabinfo-gnuplot.sh in '-t' mode, passing all of the +generated pre-processed *-totals + slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals + +This will produce a single plot (png file). + +Plots, expectedly, can be large so some fluctuations or small spikes +can go unnoticed. To deal with that, `slabinfo-gnuplot.sh' has two +options to 'zoom-in'/'zoom-out': + a) -s %d,%d overwrites the default image width and heigh + b) -r %d,%d specifies a range of samples to use (for example, + in `slabinfo -X >> FOO_STATS; sleep 1;' case, using + a "-r 40,60" range will plot only samples collected + between 40th and 60th seconds). + Christoph Lameter, May 30, 2007 +Sergey Senozhatsky, October 23, 2015 -- cgit v0.10.2 From 40911a798b5abbbec6b2e271a42addd6b26228a0 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:45:43 -0800 Subject: mm/slab_common.c: initialize kmem_cache pointer to NULL The assignment to NULL within the error condition was written in a 2014 patch to suppress a compiler warning. However it would be cleaner to just initialize the kmem_cache to NULL and just return it in case of an error condition. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab_common.c b/mm/slab_common.c index fba78e4..d88e97c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -384,7 +384,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; const char *cache_name; int err; @@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align, err = kmem_cache_sanity_check(name, size); if (err) { - s = NULL; /* suppress uninit var warning */ goto out_unlock; } -- cgit v0.10.2 From 422ff4d70c1b3b2deed431dc095432dc691f4269 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:46 -0800 Subject: mm/slub: correct the comment in calculate_order() In calculate_order(), it tries to calculate the best order by adjusting the fraction and min_objects. On each iteration on min_objects, fraction iterates on 16, 8, 4. Which means the acceptable waste increases with 1/16, 1/8, 1/4. This patch corrects the comment according to the code. Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index f614b5d..a94b9f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2943,7 +2943,7 @@ static inline int calculate_order(int size, int reserved) * works by first attempting to generate a layout with * the best configuration and backing off gradually. * - * First we reduce the acceptable waste in a slab. Then + * First we increase the acceptable waste in a slab. Then * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; -- cgit v0.10.2 From 033fd1bd3c50fdda267d27d02f9bc656f0b9ddb8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:48 -0800 Subject: mm/slub: use get_order() instead of fls() get_order() is more easy to understand. This patch just replaces it. 
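A worked example (not from the patch; assumes 4K pages, PAGE_SHIFT = 12) showing that the two expressions agree for sizes larger than one page:

    /*
     * size = 12288 bytes (three 4K pages):
     *   fls(12288 - 1) - PAGE_SHIFT = 14 - 12 = 2
     *   get_order(12288)            = 2
     * i.e. both pick an order-2 (16K) slab, the smallest that fits.
     */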
Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Reviewed-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index a94b9f4..e309ed1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,8 +2912,7 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); + for (order = max(min_order, get_order(min_objects * size)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; -- cgit v0.10.2 From 9f835703ea67633617ca82bc150f6ee70831b40a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 5 Nov 2015 18:45:51 -0800 Subject: mm/slub: calculate start order with reserved in consideration In slub_order(), the order starts from max(min_order, get_order(min_objects * size)). When (min_objects * size) has different order from (min_objects * size + reserved), it will skip this order via a check in the loop. This patch optimizes this a little by calculating the start order with `reserved' in consideration and removing the check in loop. Signed-off-by: Wei Yang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index e309ed1..e1bb147 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2912,19 +2912,15 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, get_order(min_objects * size)); + for (order = max(min_order, get_order(min_objects * size + reserved)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size + reserved) - continue; - rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; - } return order; -- cgit v0.10.2 From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 Nov 2015 18:45:54 -0800 Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc configurations defining ARCH_DMA_MINALIGN to 128), the first kmalloc_caches[] entry to be initialised after slab_early_init = 0 is "kmalloc-128" with index 7. Depending on the debug kernel configuration, sizeof(struct kmem_cache) can be larger than 128 resulting in an INDEX_NODE of 8. Commit 8fc9cf420b36 ("slab: make more slab management structure off the slab") enables off-slab management objects for sizes starting with PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation of the "kmalloc-128" cache would try to place the management objects off-slab. However, since KMALLOC_MIN_SIZE is already 128 and freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size) returns NULL (kmalloc_caches[7] not populated yet). This triggers the following bug on arm64: kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540 Hardware name: Juno (DT) PC is at __kmem_cache_create+0x21c/0x280 LR is at __kmem_cache_create+0x210/0x280 [...] 
Call trace: __kmem_cache_create+0x21c/0x280 create_boot_cache+0x48/0x80 create_kmalloc_cache+0x50/0x88 create_kmalloc_caches+0x4c/0xf4 kmem_cache_init+0x100/0x118 start_kernel+0x214/0x33c This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE. Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab") Signed-off-by: Catalin Marinas Reported-by: Geert Uytterhoeven Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 4fcc5dd..461935b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && + if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) /* * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ -- cgit v0.10.2 From 9fbed25407ccc87a7bb47ea3f411e1ca34a95f8b Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:45:57 -0800 Subject: mm/kmemleak.c: remove unneeded initialization of object to NULL A few lines below, object is reassigned by lookup_object(), so there is no need to initialize it to NULL at the beginning of find_and_get_object(). Signed-off-by: Alexey Klimov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 77191ec..19423a4 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object) static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { unsigned long flags; - struct kmemleak_object *object = NULL; + struct kmemleak_object *object; rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); -- cgit v0.10.2 From 86d2adccfbe7d5a1f050fa08db9638c9168736d9 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 5 Nov 2015 18:46:00 -0800 Subject: mm/mlock.c: reorganize mlockall() return values and remove goto-out label In the mlockall() syscall wrapper, the code after the out label does nothing but return. Remove the goto out statements and return the error values directly. Also, instead of rewriting the ret variable before every check, return the error straight from the failing check. An objdump listing shows a few assembly instructions saved; the object file size decreased from 220592 bytes to 220528 bytes for me (for aarch64).
Signed-off-by: Alexey Klimov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mlock.c b/mm/mlock.c index 25936680..7e6ad9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -684,14 +684,13 @@ out: SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; - int ret = -EINVAL; + int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) - goto out; + return -EINVAL; - ret = -EPERM; if (!can_do_mlock()) - goto out; + return -EPERM; if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ @@ -708,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); -out: + return ret; } -- cgit v0.10.2 From 0ab32b6f1b88444524e52429fab334ff96683a3f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:46:03 -0800 Subject: uaccess: reimplement probe_kernel_address() using probe_kernel_read() probe_kernel_address() is basically the same as the (later added) probe_kernel_read(). The return value on EFAULT is a bit different: probe_kernel_address() returns number-of-bytes-not-copied whereas probe_kernel_read() returns -EFAULT. All callers have been checked, none cared. probe_kernel_read() can be overridden by the architecture whereas probe_kernel_address() cannot. parisc, blackfin and um do this, to insert additional checking. Hence this patch possibly fixes obscure bugs, although there are only two probe_kernel_address() callsites outside arch/. My first attempt involved removing probe_kernel_address() entirely and converting all callsites to use probe_kernel_read() directly, but that got tiresome. This patch shrinks mm/slab_common.o by 218 bytes. For a single probe_kernel_address() callsite. Cc: Steven Miao Cc: Jeff Dike Cc: Richard Weinberger Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 00b7f7d..7d5f4c7 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) } } } else { - fault = probe_kernel_address(instrptr, instr); + fault = probe_kernel_address((void *)instrptr, instr); instr = __mem_to_opcode_arm(instr); } diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ebc1f412..13b9bcf 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -999,7 +999,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs) ret = get_user(regs->nip, &inst); pagefault_enable(); } else { - ret = probe_kernel_address(regs->nip, inst); + ret = probe_kernel_address((void *)regs->nip, inst); } if (mcheck_handle_load(regs, inst)) { diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d6f2c2c..558129a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to, #endif /* ARCH_HAS_NOCACHE_UACCESS */ -/** - * probe_kernel_address(): safely attempt to read from a location - * @addr: address to read from - its type is type typeof(retval)* - * @retval: read into this variable - * - * Safely read from address @addr into variable @revtal. If a kernel fault - * happens, handle that and return -EFAULT. - * We ensure that the __get_user() is executed in atomic context so that - * do_page_fault() doesn't attempt to take mmap_sem. 
This makes - * probe_kernel_address() suitable for use within regions where the caller - * already holds mmap_sem, or other locks which nest inside mmap_sem. - * This must be a macro because __get_user() needs to know the types of the - * args. - * - * We don't include enough header files to be able to do the set_fs(). We - * require that the probe_kernel_address() caller will do that. - */ -#define probe_kernel_address(addr, retval) \ - ({ \ - long ret; \ - mm_segment_t old_fs = get_fs(); \ - \ - set_fs(KERNEL_DS); \ - pagefault_disable(); \ - ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ - pagefault_enable(); \ - set_fs(old_fs); \ - ret; \ - }) - /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data @@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +/** + * probe_kernel_address(): safely attempt to read from a location + * @addr: address to read from + * @retval: read into this variable + * + * Returns 0 on success, or -EFAULT. + */ +#define probe_kernel_address(addr, retval) \ + probe_kernel_read(&retval, addr, sizeof(retval)) + #endif /* __LINUX_UACCESS_H__ */ diff --git a/mm/maccess.c b/mm/maccess.c index 34fe247..1b13638 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -13,6 +13,11 @@ * * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. + * + * We ensure that the copy_from_user is executed in atomic context so that + * do_page_fault() doesn't attempt to take mmap_sem. This makes + * probe_kernel_read() suitable for use within regions where the caller + * already holds mmap_sem, or other locks which nest inside mmap_sem. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) -- cgit v0.10.2 From 55e1ceaf2586ab11aafba798a6b9499dd7c14441 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:46:06 -0800 Subject: mm/mmap.c: remove useless statement "vma = NULL" in find_vma() Before the main loop, vma is already is NULL. There is no need to set it to NULL again. Signed-off-by: Chen Gang Reviewed-by: Oleg Nesterov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 79bcc9f..bd932c1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2047,7 +2047,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) return vma; rb_node = mm->mm_rb.rb_node; - vma = NULL; while (rb_node) { struct vm_area_struct *tmp; -- cgit v0.10.2 From 626ebc4100285be56fe3546f29b6afeb36b6871a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:09 -0800 Subject: memcg: flatten task_struct->memcg_oom task_struct->memcg_oom is a sub-struct containing fields which are used for async memcg oom handling. Most task_struct fields aren't packaged this way and it can lead to unnecessary alignment paddings. This patch flattens it. * task.memcg_oom.memcg -> task.memcg_in_oom * task.memcg_oom.gfp_mask -> task.memcg_oom_gfp_mask * task.memcg_oom.order -> task.memcg_oom_order * task.memcg_oom.may_oom -> task.memcg_may_oom In addition, task.memcg_may_oom is relocated to where other bitfields are which reduces the size of task_struct. 
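To see why the embedded sub-struct can cost space, here is a stand-alone illustration (not kernel code: the field names are only borrowed for readability, and the exact numbers are compiler/ABI dependent; a typical LP64 GCC prints 32 vs 24 here):

#include <stdio.h>

struct nested {
	unsigned a:1;			/* an existing bitfield run */
	int x;
	struct {			/* sub-struct gets its own layout ... */
		void *memcg;
		unsigned int gfp_mask;
		int order;
		unsigned int may_oom:1;	/* ... so this bit needs its own word
					 * plus trailing padding */
	} oom;
};

struct flattened {
	unsigned a:1;
	unsigned may_oom:1;		/* shares the word already used by 'a' */
	int x;
	void *memcg;
	unsigned int gfp_mask;
	int order;
};

int main(void)
{
	printf("nested:    %zu bytes\n", sizeof(struct nested));
	printf("flattened: %zu bytes\n", sizeof(struct flattened));
	return 0;
}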
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e3318dd..56174c7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -406,19 +406,19 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, static inline void mem_cgroup_oom_enable(void) { - WARN_ON(current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 1; + WARN_ON(current->memcg_may_oom); + current->memcg_may_oom = 1; } static inline void mem_cgroup_oom_disable(void) { - WARN_ON(!current->memcg_oom.may_oom); - current->memcg_oom.may_oom = 0; + WARN_ON(!current->memcg_may_oom); + current->memcg_may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.memcg; + return p->memcg_in_oom; } bool mem_cgroup_oom_synchronize(bool wait); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5423b9c..17bf8b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1473,7 +1473,9 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - +#ifdef CONFIG_MEMCG + unsigned memcg_may_oom:1; +#endif #ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif @@ -1804,12 +1806,9 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG - struct memcg_oom_info { - struct mem_cgroup *memcg; - gfp_t gfp_mask; - int order; - unsigned int may_oom:1; - } memcg_oom; + struct mem_cgroup *memcg_in_oom; + gfp_t memcg_oom_gfp_mask; + int memcg_oom_order; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c57c442..47bd7f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1661,7 +1661,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1678,9 +1678,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. 
*/ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1702,7 +1702,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1730,8 +1730,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1748,7 +1748,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } -- cgit v0.10.2 From b23afb93d317c65cef553b804f08dec8a7a0f7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:11 -0800 Subject: memcg: punt high overage reclaim to return-to-userland path Currently, try_charge() tries to reclaim memory synchronously when the high limit is breached; however, if the allocation doesn't have __GFP_WAIT, synchronous reclaim is skipped. If a process performs only speculative allocations, it can blow way past the high limit. This is actually easily reproducible by simply doing "find /". slab/slub allocator tries speculative allocations first, so as long as there's memory which can be consumed without blocking, it can keep allocating memory regardless of the high limit. This patch makes try_charge() always punt the over-high reclaim to the return-to-userland path. If try_charge() detects that high limit is breached, it adds the overage to current->memcg_nr_pages_over_high and schedules execution of mem_cgroup_handle_over_high() which performs synchronous reclaim from the return-to-userland path. As long as kernel doesn't have a run-away allocation spree, this should provide enough protection while making kmemcg behave more consistently. It also has the following benefits. - All over-high reclaims can use GFP_KERNEL regardless of the specific gfp mask in use, e.g. GFP_NOFS, when the limit was breached. - It copes with prio inversion. Previously, a low-prio task with small memory.high might perform over-high reclaim with a bunch of locks held. If a higher prio task needed any of these locks, it would have to wait until the low prio task finished reclaim and released the locks. By handing over-high reclaim to the task exit path this issue can be avoided. 
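Stripped of the memcg details, the deferral pattern itself is small. A toy, stand-alone sketch (not kernel code; the real implementation records the debt in current->memcg_nr_pages_over_high and uses set_notify_resume() and tracehook_notify_resume(), as the diff below shows):

#include <stdio.h>

/* Toy model: the charge path only records how far it went over the high
 * limit and raises a flag; the actual work happens later, at a point with
 * no locks held and a known context (in the kernel: the return-to-userland
 * path, always with GFP_KERNEL).
 */
struct task {
	unsigned long nr_pages_over_high;	/* debt recorded by hot path */
	int notify_resume;			/* stands in for TIF_NOTIFY_RESUME */
};

static void reclaim_pages(unsigned long nr)
{
	printf("reclaiming %lu pages with a clean context\n", nr);
}

static void try_charge_hot_path(struct task *t, unsigned long overage)
{
	t->nr_pages_over_high += overage;	/* cheap: no reclaim, no locks */
	t->notify_resume = 1;			/* ask to be called back later */
}

static void return_to_userland(struct task *t)
{
	if (!t->notify_resume)
		return;
	t->notify_resume = 0;
	if (t->nr_pages_over_high) {
		reclaim_pages(t->nr_pages_over_high);
		t->nr_pages_over_high = 0;
	}
}

int main(void)
{
	struct task t = { 0, 0 };

	try_charge_hot_path(&t, 32);	/* e.g. a speculative slab allocation */
	try_charge_hot_path(&t, 32);
	return_to_userland(&t);		/* pays the accumulated debt once */
	return 0;
}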
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56174c7..77bf429 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -401,6 +401,8 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +void mem_cgroup_handle_over_high(void); + void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -620,6 +622,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } +static inline void mem_cgroup_handle_over_high(void) +{ +} + static inline void mem_cgroup_oom_enable(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 17bf8b8..055f2ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1809,6 +1809,9 @@ struct task_struct { struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; + + /* number of pages to reclaim on returning to userland */ + unsigned int memcg_nr_pages_over_high; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 84d4972..26c1521 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -50,6 +50,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) smp_mb__after_atomic(); if (unlikely(current->task_works)) task_work_run(); + + mem_cgroup_handle_over_high(); } #endif /* */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47bd7f1..327dcda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -1972,6 +1973,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2080,17 +2106,22 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_WAIT but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. 
*/ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += nr_pages; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; -- cgit v0.10.2 From cbfb479809c1b8d871cb9a31832e065e900a24c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:14 -0800 Subject: memcg: collect kmem bypass conditions into __memcg_kmem_bypass() memcg_kmem_newpage_charge() and memcg_kmem_get_cache() are testing the same series of conditions to decide whether to bypass kmem accounting. Collect the tests into __memcg_kmem_bypass(). This is pure refactoring. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 77bf429..d8174e8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -777,20 +777,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -/** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. - * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. - */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; @@ -812,6 +799,26 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) if (unlikely(fatal_signal_pending(current))) return true; + return false; +} + +/** + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. + * @gfp: the gfp allocation flags. + * @memcg: a pointer to the memcg this was charged against. + * @order: allocation order. + * + * returns true if the memcg where the current task belongs can hold this + * allocation. + * + * We return true automatically if this allocation is not to be accounted to + * any memcg. + */ +static inline bool +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +{ + if (__memcg_kmem_bypass(gfp)) + return true; return __memcg_kmem_newpage_charge(gfp, memcg, order); } @@ -854,17 +861,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (!memcg_kmem_enabled()) - return cachep; - if (gfp & __GFP_NOACCOUNT) - return cachep; - if (gfp & __GFP_NOFAIL) + if (__memcg_kmem_bypass(gfp)) return cachep; - if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) - return cachep; - if (unlikely(fatal_signal_pending(current))) - return cachep; - return __memcg_kmem_get_cache(cachep); } -- cgit v0.10.2 From 10d53c748bc9531f47e13f98e32ef28be4399862 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:17 -0800 Subject: memcg: ratify and consolidate over-charge handling try_charge() is the main charging logic of memcg. 
When it hits the limit but either can't fail the allocation due to __GFP_NOFAIL or the task is likely to free memory very soon, being OOM killed, has SIGKILL pending or exiting, it "bypasses" the charge to the root memcg and returns -EINTR. While this is one approach which can be taken for these situations, it has several issues. * It unnecessarily lies about the reality. The number itself doesn't go over the limit but the actual usage does. memcg is either forced to or actively chooses to go over the limit because that is the right behavior under the circumstances, which is completely fine, but, if at all avoidable, it shouldn't be misrepresenting what's happening by sneaking the charges into the root memcg. * Despite trying, we already do over-charge. kmemcg can't deal with switching over to the root memcg by the point try_charge() returns -EINTR, so it open-codes over-charing. * It complicates the callers. Each try_charge() user has to handle the weird -EINTR exception. memcg_charge_kmem() does the manual over-charging. mem_cgroup_do_precharge() performs unnecessary uncharging of root memcg, which BTW is inconsistent with what memcg_charge_kmem() does but not broken as [un]charging are noops on root memcg. mem_cgroup_try_charge() needs to switch the returned cgroup to the root one. The reality is that in memcg there are cases where we are forced and/or willing to go over the limit. Each such case needs to be scrutinized and justified but there definitely are situations where that is the right thing to do. We alredy do this but with a superficial and inconsistent disguise which leads to unnecessary complications. This patch updates try_charge() so that it over-charges and returns 0 when deemed necessary. -EINTR return is removed along with all special case handling in the callers. While at it, remove the local variable @ret, which was initialized to zero and never changed, along with done: label which just returned the always zero @ret. Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 327dcda..b952abe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2008,13 +2008,12 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || !page_counter_try_charge(&memcg->memsw, batch, &counter)) { @@ -2042,7 +2041,7 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -2088,10 +2087,10 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); @@ -2099,8 +2098,18 @@ retry: nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. 
+ */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); @@ -2123,8 +2132,8 @@ done_restock: break; } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2216,28 +2225,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) + if (ret) page_counter_uncharge(&memcg->kmem, nr_pages); return ret; @@ -4438,22 +4426,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -5358,11 +5334,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; -- cgit v0.10.2 From 7f822c24c2b32a9feafb33e3b9a9e23ef4278c2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 18:46:20 -0800 Subject: memcg: drop unnecessary cold-path tests from __memcg_kmem_bypass() __memcg_kmem_bypass() decides whether a kmem allocation should be bypassed to the root memcg. Some conditions that it tests are valid criteria regarding who should be held accountable; however, there are a couple unnecessary tests for cold paths - __GFP_FAIL and fatal_signal_pending(). The previous patch updated try_charge() to handle both __GFP_FAIL and dying tasks correctly and the only thing these two tests are doing is making accounting less accurate and sprinkling tests for cold path conditions in the hot paths. There's nothing meaningful gained by these extra tests. This patch removes the two unnecessary tests from __memcg_kmem_bypass(). 
Signed-off-by: Tejun Heo Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d8174e8..641bb60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -781,24 +781,10 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; - if (gfp & __GFP_NOACCOUNT) return true; - /* - * __GFP_NOFAIL allocations will move on even if charging is not - * possible. Therefore we don't even try, and have this allocation - * unaccounted. We could in theory charge it forcibly, but we hope - * those allocations are rare, and won't be worth the trouble. - */ - if (gfp & __GFP_NOFAIL) - return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; - - /* If the test is dying, just let it go. */ - if (unlikely(fatal_signal_pending(current))) - return true; - return false; } -- cgit v0.10.2 From 61f9ec1d8e97131ce55159647fcdfeccc0f40647 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 5 Nov 2015 18:46:23 -0800 Subject: mm: fix docbook comment for get_vaddr_frames() get_vaddr_frames() has a comment that's *almost* a docbook comment; add the missing star so that the tools will find it properly. Signed-off-by: Jonathan Corbet Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/frame_vector.c b/mm/frame_vector.c index cdabcb9..7cf2b71 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -7,7 +7,7 @@ #include #include -/* +/** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map -- cgit v0.10.2 From 145949a1387ba7a4fd0df15181e09345ec7b0492 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 5 Nov 2015 18:46:26 -0800 Subject: mm/list_lru.c: replace nr_node_ids for loop with for_each_node() The functions used in the patch are in slowpath, which gets called whenever alloc_super is called during mounts. Though this should not make difference for the architectures with sequential numa node ids, for the powerpc which can potentially have sparse node ids (for e.g., 4 node system having numa ids, 0,1,16,17 is common), this patch saves some unnecessary allocations for non existing numa nodes. Even without that saving, perhaps patch makes code more readable. [vdavydov@parallels.com: take memcg_aware check outside for_each loop] Signed-off-by: Raghavendra K T Reviewed-by: Vladimir Davydov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Anton Blanchard Cc: Nishanth Aravamudan Cc: Greg Kurz Cc: Grant Likely Cc: Nikunj A Dadhania Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/list_lru.c b/mm/list_lru.c index e1da19f..2823747 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru) #ifdef CONFIG_MEMCG_KMEM static inline bool list_lru_memcg_aware(struct list_lru *lru) { + /* + * This needs node 0 to be always present, even + * in the systems supporting sparse numa ids. 
+ */ return !!lru->node[0].memcg_lrus; } @@ -377,16 +381,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; - for (i = 0; i < nr_node_ids; i++) { - if (!memcg_aware) - lru->node[i].memcg_lrus = NULL; - else if (memcg_init_list_lru_node(&lru->node[i])) + if (!memcg_aware) + return 0; + + for_each_node(i) { + if (memcg_init_list_lru_node(&lru->node[i])) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; memcg_destroy_list_lru_node(&lru->node[i]); + } return -ENOMEM; } @@ -397,7 +405,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_destroy_list_lru_node(&lru->node[i]); } @@ -409,16 +417,20 @@ static int memcg_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return 0; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; + memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); + } return -ENOMEM; } @@ -430,7 +442,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); } @@ -485,7 +497,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); } @@ -522,7 +534,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, if (!lru->node) goto out; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); -- cgit v0.10.2 From c118baf802562688d46e6002f2b5fe66b947da21 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 5 Nov 2015 18:46:29 -0800 Subject: arch/powerpc/mm/numa.c: do not allocate bootmem memory for non existing nodes With the setup_nr_nodes(), we have already initialized node_possible_map. So it is safe to use for_each_node here. There are many places in the kernel that use hardcoded 'for' loop with nr_node_ids, because all other architectures have numa nodes populated serially. That should be reason we had maintained the same for powerpc. But, since sparse numa node ids possible on powerpc, we unnecessarily allocate memory for non existent numa nodes. For e.g., on a system with 0,1,16,17 as numa nodes nr_node_ids=18 and we allocate memory for nodes 2-14. This patch we allocate memory for only existing numa nodes. The patch is boot tested on a 4 node tuleta, confirming with printks that it works as expected. 
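The saving is easiest to see with the sparse-id example from the changelog; a toy stand-alone model follows (not the real kernel macros, which walk node_possible_map, but the effect is the same):

#include <stdio.h>

#define MAX_NUMNODES	18	/* nodes 0,1,16,17 exist => nr_node_ids = 18 */

static const unsigned long node_possible_map =
	(1UL << 0) | (1UL << 1) | (1UL << 16) | (1UL << 17);
static const int nr_node_ids = MAX_NUMNODES;

/* toy stand-in for the kernel's for_each_node() */
#define for_each_node(n) \
	for ((n) = 0; (n) < MAX_NUMNODES; (n)++) \
		if (node_possible_map & (1UL << (n)))

int main(void)
{
	int n, dense = 0, sparse = 0;

	for (n = 0; n < nr_node_ids; n++)	/* old style: 18 node slots */
		dense++;
	for_each_node(n)			/* only the 4 existing nodes */
		sparse++;
	printf("plain loop touches %d node slots, for_each_node() %d\n",
	       dense, sparse);
	return 0;
}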
Signed-off-by: Raghavendra K T Cc: Vladimir Davydov Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Anton Blanchard Cc: Nishanth Aravamudan Cc: Greg Kurz Cc: Grant Likely Cc: Nikunj A Dadhania Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 8b9502a..8d8a541 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void) setup_nr_node_ids(); /* allocate the map */ - for (node = 0; node < nr_node_ids; node++) + for_each_node(node) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ -- cgit v0.10.2 From b0d61c7e56815b0b881c81f6779a65f4fdae4bc0 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:32 -0800 Subject: mm/msync: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/msync.c b/mm/msync.c index bb04d53..24e612f 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; -- cgit v0.10.2 From 1824cb753354e026ab898cd472bddd540b50b00b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:35 -0800 Subject: mm/nommu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/nommu.c b/mm/nommu.c index ab14a20..1e0f168 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) goto erase_whole_vma; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) return -EINVAL; - if (end != vma->vm_end && end & ~PAGE_MASK) + if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); @@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr, if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; if (flags & MREMAP_FIXED && new_addr != addr) -- cgit v0.10.2 From e7bbdd071314b52507e6c615e2cec90d46f82c57 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:38 -0800 Subject: mm/mincore: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
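For reference, offset_in_page() is just a named form of the same masking that these conversions replace. A stand-alone demonstration (PAGE_SIZE, PAGE_MASK and the macro are re-defined locally here for a 4 KiB page, so this is an illustration rather than the kernel header itself):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
	unsigned long addr = 0x12345678;

	/* prints 0x678: the byte offset of addr within its 4 KiB page */
	printf("offset_in_page(0x%lx) = 0x%lx\n", addr, offset_in_page(addr));
	return 0;
}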
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mincore.c b/mm/mincore.c index be25efd..14bb9fb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, /* This also avoids any overflows on PAGE_CACHE_ALIGN */ pages = len >> PAGE_SHIFT; - pages += (len & ~PAGE_MASK) != 0; + pages += (offset_in_page(len)) != 0; if (!access_ok(VERIFY_WRITE, vec, pages)) return -EFAULT; -- cgit v0.10.2 From 5d57b0146aa942b939bbd77e09130270dc9b97d2 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:40 -0800 Subject: mm/early_ioremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 17ae14b..6d5717b 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; + offset = offset_in_page(phys_addr); phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - phys_addr; @@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) return; - offset = virt_addr & ~PAGE_MASK; + offset = offset_in_page(virt_addr); nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; @@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) char *p; while (size) { - slop = src & ~PAGE_MASK; + slop = offset_in_page(src); clen = size; if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; -- cgit v0.10.2 From f09f1243ca2d5d297881bf2c2148d9ab35314314 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:43 -0800 Subject: mm/percpu: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/percpu.c b/mm/percpu.c index a63b4d8..8a943b9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); - PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); - PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); - PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); @@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; @@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { -- cgit v0.10.2 From ea53cde089e07cfd7996c2072f770ebb984ce8db Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:46 -0800 Subject: mm/util: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/util.c b/mm/util.c index 68ff8a5..9af1c12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) + if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -- cgit v0.10.2 From 8fd9e4883a2b08c52ec00f3c214b45d096fc697a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:49 -0800 Subject: mm/mlock: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mlock.c b/mm/mlock.c index 7e6ad9c..550228d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -560,7 +560,7 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) @@ -616,7 +616,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); @@ -645,7 +645,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); -- cgit v0.10.2 From 891c49abfb097bbd7024b4072dd1c8e1c995d3ec Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:51 -0800 Subject: mm/vmalloc: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af3a519..9db9ef5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -358,7 +358,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *first; BUG_ON(!size); - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), @@ -936,7 +936,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) void *vaddr = NULL; unsigned int order; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* @@ -989,7 +989,7 @@ static void vb_free(const void *addr, unsigned long size) unsigned int order; struct vmap_block *vb; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); @@ -1902,7 +1902,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -1941,7 +1941,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -2392,7 +2392,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, bool purged = false; /* verify parameters and allocate data structures */ - BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; -- cgit v0.10.2 From de1741a1333ea37694dddf7c94aa4cf2d0e58912 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:54 -0800 Subject: mm/mmap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index bd932c1..3ec19b6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; /* Do simple checking here so the lower-level routines won't have @@ -1473,7 +1473,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1989,7 +1989,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; @@ -2025,7 +2025,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; addr = arch_rebalance_pgtables(addr, len); @@ -2535,7 +2535,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; struct vm_area_struct *vma, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; len = PAGE_ALIGN(len); @@ -2733,7 +2733,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (error & ~PAGE_MASK) + if (offset_in_page(error)) return error; error = mlock_future_check(mm, mm->def_flags, len); -- cgit v0.10.2 From f19cb115a25f3f25752fdc56340e7433462157ba Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:46:57 -0800 Subject: mm/mremap: use offset_in_page macro linux/mm.h provides offset_in_page() macro. Let's use already predefined macro instead of (addr & ~PAGE_MASK). 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mremap.c b/mm/mremap.c index 5a71cce..c25bc62 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -401,7 +401,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long charged = 0; unsigned long map_flags; - if (new_addr & ~PAGE_MASK) + if (offset_in_page(new_addr)) goto out; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) @@ -435,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (ret & ~PAGE_MASK) + if (offset_in_page(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); - if (!(ret & ~PAGE_MASK)) + if (!(offset_in_page(ret))) goto out; out1: vm_unacct_memory(charged); @@ -484,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return ret; old_len = PAGE_ALIGN(old_len); @@ -566,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (new_addr & ~PAGE_MASK) { + if (offset_in_page(new_addr)) { ret = new_addr; goto out; } @@ -574,7 +574,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) { + if (offset_in_page(ret)) { vm_unacct_memory(charged); locked = 0; } -- cgit v0.10.2 From 35bd16a227534cb6ffc9b26a33061c2dcf91934b Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:47:00 -0800 Subject: mm/memblock: make memblock_remove_range() static memblock_remove_range() is only used in the mm/memblock.c, so we can make it static. Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memblock.h b/include/linux/memblock.h index c518eb5..24daf8f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, unsigned long flags); -int memblock_remove_range(struct memblock_type *type, - phys_addr_t base, - phys_addr_t size); - void __next_mem_range(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, diff --git a/mm/memblock.c b/mm/memblock.c index 1c7b647..d300f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -int __init_memblock memblock_remove_range(struct memblock_type *type, +static int __init_memblock memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; -- cgit v0.10.2 From f2f81fb2b72b83b661b11da6f1b0bd3526706278 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:03 -0800 Subject: mm, migrate: count pages failing all retries in vmstat and tracepoint Migration tries up to 10 times to migrate pages that return -EAGAIN until it gives up. If some pages fail all retries, they are counted towards the number of failed pages that migrate_pages() returns. 
They should also be counted in the /proc/vmstat pgmigrate_fail and in the mm_migrate_pages tracepoint. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: "Aneesh Kumar K.V" Cc: Konstantin Khlebnikov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 842ecd7..94961f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1169,7 +1169,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, } } } - rc = nr_failed + retry; + nr_failed += retry; + rc = nr_failed; out: if (nr_succeeded) count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); -- cgit v0.10.2 From b171e4093017d4d6e411f5e97823e5e4a21266a2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:06 -0800 Subject: mm/page_alloc: remove unused parameter in init_currently_empty_zone() Commit a2f3aa025766 ("[PATCH] Fix sparsemem on Cell") fixed an oops experienced on the Cell architecture when init-time functions, early_*(), are called at runtime by introducing an 'enum memmap_context' parameter to memmap_init_zone() and init_currently_empty_zone(). This parameter is intended to be used to tell whether the call of these two functions is being made on behalf of a hotplug event, or happening at boot-time. However, init_currently_empty_zone() does not use this parameter at all, so remove it. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d943477..2d7e660 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -823,8 +823,7 @@ enum memmap_context { MEMMAP_HOTPLUG, }; extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, - unsigned long size, - enum memmap_context context); + unsigned long size); extern void lruvec_init(struct lruvec *lruvec); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0780d11..67d488a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages, - MEMMAP_HOTPLUG); + return init_currently_empty_zone(zone, start_pfn, num_pages); + return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805bbad..c60605d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4900,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone) int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size, - enum memmap_context context) + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -5413,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, - size, MEMMAP_EARLY); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; -- cgit v0.10.2 From 600e19afc5f8a6c18ea49cee9511c5797db02391 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 5 Nov 2015 18:47:08 -0800 Subject: mm: use only per-device readahead limit Maximal readahead size is limited now by two values: 1) by global 2Mb constant (MAX_READAHEAD in max_sane_readahead()) 2) by configurable per-device value* (bdi->ra_pages) There are 
devices, which require custom readahead limit. For instance, for RAIDs it's calculated as number of devices multiplied by chunk size times 2. Readahead size can never be larger than bdi->ra_pages * 2 value (POSIX_FADV_SEQUNTIAL doubles readahead size). If so, why do we need two limits? I suggest to completely remove this max_sane_readahead() stuff and use per-device readahead limit everywhere. Also, using right readahead size for RAID disks can significantly increase i/o performance: before: dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 12.9741 s, 808 MB/s after: $ dd if=/dev/md2 of=/dev/null bs=100M count=100 100+0 records in 100+0 records out 10485760000 bytes (10 GB) copied, 8.91317 s, 1.2 GB/s (It's an 8-disks RAID5 storage). This patch doesn't change sys_readahead and madvise(MADV_WILLNEED) behavior introduced by 6d2be915e589b58 ("mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead pages"). Signed-off-by: Roman Gushchin Cc: Raghavendra K T Cc: Jan Kara Cc: Wu Fengguang Cc: David Rientjes Cc: onstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 80001de..fa08f3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2036,8 +2036,6 @@ void page_cache_async_readahead(struct address_space *mapping, pgoff_t offset, unsigned long size); -unsigned long max_sane_readahead(unsigned long nr); - /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 327910c..1fe962b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1807,7 +1807,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct file *file, pgoff_t offset) { - unsigned long ra_pages; struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ @@ -1836,10 +1835,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* * mmap read-around */ - ra_pages = max_sane_readahead(ra->ra_pages); - ra->start = max_t(long, 0, offset - ra_pages / 2); - ra->size = ra_pages; - ra->async_size = ra_pages / 4; + ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); } diff --git a/mm/readahead.c b/mm/readahead.c index 24682f6..998ad59 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = max_sane_readahead(nr_to_read); + nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); while (nr_to_read) { int err; @@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } -#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. 
- */ -unsigned long max_sane_readahead(unsigned long nr) -{ - return min(nr, MAX_READAHEAD); -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large @@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = max_sane_readahead(ra->ra_pages); + unsigned long max = ra->ra_pages; pgoff_t prev_offset; /* -- cgit v0.10.2 From 25ee01a2fca02dfb5a3ce316e77910c468108199 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:11 -0800 Subject: mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps Currently /proc/PID/smaps provides no usage info for vma(VM_HUGETLB), which is inconvenient when we want to know per-task or per-vma base hugetlb usage. To solve this, this patch adds new fields for hugetlb usage like below: Size: 20480 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB AnonHugePages: 0 kB Shared_Hugetlb: 18432 kB Private_Hugetlb: 2048 kB Swap: 0 kB KernelPageSize: 2048 kB MMUPageSize: 2048 kB Locked: 0 kB VmFlags: rd wr mr mw me de ht [hughd@google.com: fix Private_Hugetlb alignment ] Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 3a9d65c..a7d6c06 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -424,6 +424,9 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +AnonHugePages: 0 kB +Shared_Hugetlb: 0 kB +Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB @@ -452,6 +455,11 @@ and a page is modified, the file page is replaced by a private anonymous copy. "Swap" shows how much would-be-anonymous memory is also used, but out on swap. "SwapPss" shows proportional swap share of this mapping. +"AnonHugePages" shows the ammount of memory backed by transparent hugepage. +"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by +hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical +reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. + "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded manner. 
The codes are the following: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b029d42..5988b83 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -446,6 +446,8 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; u64 pss; u64 swap_pss; }; @@ -625,12 +627,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) seq_putc(m, '\n'); } +#ifdef CONFIG_HUGETLB_PAGE +static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } + if (page) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) + mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } + return 0; +} +#endif /* HUGETLB_PAGE */ + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, +#ifdef CONFIG_HUGETLB_PAGE + .hugetlb_entry = smaps_hugetlb_range, +#endif .mm = vma->vm_mm, .private = &mss, }; @@ -652,6 +686,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" @@ -667,6 +703,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shared_hugetlb >> 10, + mss.private_hugetlb >> 10, mss.swap >> 10, (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, -- cgit v0.10.2 From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:14 -0800 Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status Currently there's no easy way to get per-process usage of hugetlb pages, which is inconvenient because userspace applications which use hugetlb typically want to control their processes on the basis of how much memory (including hugetlb) they use. So this patch simply provides easy access to the info via /proc/PID/status. Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a7d6c06..12ac0e4 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -175,6 +175,7 @@ read the file /proc/PID/status: VmLib: 1412 kB VmPTE: 20 kb VmSwap: 0 kB + HugetlbPages: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1) VmPTE size of page table entries VmPMD size of second level page tables VmSwap size of swap usage (the number of referred swapents) + HugetlbPages size of hugetlb memory portions Threads number of threads SigQ number of signals queued/max. 
number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5988b83..288185e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) ptes >> 10, pmds >> 10, swap << (PAGE_SHIFT-10)); + hugetlb_report_usage(m, mm); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5e35379..685c262 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, #define hugepages_supported() (HPAGE_SHIFT != 0) #endif +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); + +static inline void hugetlb_count_add(long l, struct mm_struct *mm) +{ + atomic_long_add(l, &mm->hugetlb_usage); +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ + atomic_long_sub(l, &mm->hugetlb_usage); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, { return &mm->page_table_lock; } + +static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) +{ +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3d6baa7..0a85da2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -486,6 +486,9 @@ struct mm_struct { /* address of the bounds directory */ void __user *bd_addr; #endif +#ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cc7734..abfbe8c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2790,6 +2790,12 @@ void hugetlb_show_meminfo(void) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { @@ -3025,6 +3031,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, get_page(ptepage); page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -3105,6 +3112,7 @@ again: if (huge_pte_dirty(pte)) set_page_dirty(page); + hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { @@ -3509,6 +3517,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); diff --git a/mm/rmap.c b/mm/rmap.c index f5b5c1f..d40e7ae 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1352,7 +1352,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (!PageHuge(page)) { + if (PageHuge(page)) { + hugetlb_count_sub(1 << compound_order(page), mm); + } else { if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else -- cgit v0.10.2 From 29d06bbb41595f82db309a5516426ef8bd0f27b7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:17 -0800 Subject: mm/vmscan: make inactive_anon_is_low_global return directly Delete unnecessary if to let inactive_anon_is_low_global return directly. No functional changes. Signed-off-by: Yaowei Bai Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f63a93..773fc8d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1866,10 +1866,7 @@ static int inactive_anon_is_low_global(struct zone *zone) active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** -- cgit v0.10.2 From 21c527a3cba07f9a9ce17b3a445f110a847793e2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:20 -0800 Subject: mm/compaction.c: add an is_via_compact_memory() helper Introduce is_via_compact_memory() helper indicating compacting via /proc/sys/vm/compact_memory to improve readability. To catch this situation in __compaction_suitable, use order as parameter directly instead of using struct compact_control. This patch has no functional changes. Signed-off-by: Yaowei Bai Cc: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index c5c627a..a8e6593 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1197,6 +1197,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; } +/* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { @@ -1223,11 +1232,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_COMPLETE; } - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ @@ -1290,11 +1295,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, int fragindex; unsigned long watermark; - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (order == -1) + if (is_via_compact_memory(order)) return COMPACT_CONTINUE; watermark = low_wmark_pages(zone); @@ -1658,10 +1659,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) * this makes sure we compact the whole zone regardless of * cached scanner positions. */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) __reset_isolation_suitable(zone); - if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + if (is_via_compact_memory(cc->order) || + !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); if (cc->order > 0) { -- cgit v0.10.2 From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Thu, 5 Nov 2015 18:47:23 -0800 Subject: mm/filemap.c: make global sync not clear error status of individual inodes filemap_fdatawait() is a function to wait for on-going writeback to complete but also consume and clear error status of the mapping set during writeback. The latter functionality is critical for applications to detect writeback error with system calls like fsync(2)/fdatasync(2). However filemap_fdatawait() is also used by sync(2) or FIFREEZE ioctl, which don't check error status of individual mappings. As a result, fsync() may not be able to detect writeback error if events happen in the following order: Application System admin ---------------------------------------------------------- write data on page cache Run sync command writeback completes with error filemap_fdatawait() clears error fsync returns success (but the data is not on disk) This patch adds filemap_fdatawait_keep_errors() for call sites where writeback error is not handled so that they don't clear error status. Signed-off-by: Jun'ichi Nomura Acked-by: Andi Kleen Reviewed-by: Tejun Heo Cc: Fengguang Wu Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7378169..206a68b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); old_inode = inode; - filemap_fdatawait(mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. 
+ */ + filemap_fdatawait_keep_errors(mapping); cond_resched(); diff --git a/fs/sync.c b/fs/sync.c index fbc98ee..4ec430a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) static void fdatawait_one_bdev(struct block_device *bdev, void *arg) { - filemap_fdatawait(bdev->bd_inode->i_mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 72d8a84..9355f37 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); diff --git a/mm/filemap.c b/mm/filemap.c index 1fe962b..884766d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait_range - wait for writeback to complete - * @mapping: address space structure to wait for - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, - loff_t end_byte) +static int __filemap_fdatawait_range(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret2, ret = 0; + int ret = 0; if (end_byte < start_byte) goto out; @@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, cond_resched(); } out: + return ret; +} + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. Check error status of + * the address space and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + int ret, ret2; + + ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); ret2 = filemap_check_errors(mapping); if (!ret) ret = ret2; @@ -383,11 +397,38 @@ out: EXPORT_SYMBOL(filemap_fdatawait_range); /** + * filemap_fdatawait_keep_errors - wait for writeback without clearing errors + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. 
Unlike filemap_fdatawait(), this function + * does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +void filemap_fdatawait_keep_errors(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return; + + __filemap_fdatawait_range(mapping, 0, i_size - 1); +} + +/** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space - * and wait for all of them. + * and wait for all of them. Check error status of the address space + * and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. */ int filemap_fdatawait(struct address_space *mapping) { -- cgit v0.10.2 From a5f65109026b35b654b94fdcd26a971185a53adc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:26 -0800 Subject: mm: hwpoison: ratelimit messages from unpoison_memory() Currently the kernel prints out the result of every single unpoison event, which is not necessary because unpoison is purely a testing feature and testers can get little or no information from lots of lines of unpoison log storm. So this patch ratelimits printk in unpoison_memory(). This patch introduces a file-local ratelimit_state, which adds 64 bytes to memory-failure.o. If we apply pr_info_ratelimited() for the 8 callsites below, 256 bytes is added, so it's a win. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9588269..16a0ec3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -1403,6 +1404,12 @@ static int __init memory_failure_init(void) } core_initcall(memory_failure_init); +#define unpoison_pr_info(fmt, pfn, rs) \ +({ \ + if (__ratelimit(rs)) \ + pr_info(fmt, pfn); \ +}) + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1421,6 +1428,8 @@ int unpoison_memory(unsigned long pfn) struct page *p; int freeit = 0; unsigned int nr_pages; + static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (!pfn_valid(pfn)) return -ENXIO; @@ -1429,23 +1438,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_count(page) > 1) { - pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapped(page)) { - pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); + unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + pfn, &unpoison_rs); return 0; } if (page_mapping(page)) { - pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", - pfn); + unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1455,7 +1467,8 @@ int unpoison_memory(unsigned long pfn) *
In such case, we yield to memory_failure() and make unpoison fail. */ if (!PageHuge(page) && PageTransHuge(page)) { - pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1469,12 +1482,14 @@ int unpoison_memory(unsigned long pfn) * to the end. */ if (PageHuge(page)) { - pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1486,7 +1501,8 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + pfn, &unpoison_rs); num_poisoned_pages_sub(nr_pages); freeit = 1; if (PageHuge(page)) -- cgit v0.10.2 From 3608de0787e51d3d826656e105524b48ade7b16f Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 5 Nov 2015 18:47:29 -0800 Subject: mm/memcontrol.c: fix order calculation in try_charge() Since commit 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()"), the order to pass to mem_cgroup_oom() is calculated by passing the number of pages to get_order() instead of the expected size in bytes. AFAICT, it only affects the value displayed in the oom warning message. This patch fixes this. Michal said: : We haven't noticed that just because the OOM is enabled only for page : faults of order-0 (single page) and get_order work just fine. Thanks for : noticing this. If we ever start triggering OOM on different orders this : would be broken. Signed-off-by: Jerome Marchand Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b952abe..a1c05ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,7 +2094,8 @@ retry: mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -- cgit v0.10.2 From 80f73b4b71f767f7fcd85f68be18af7795904484 Mon Sep 17 00:00:00 2001 From: Ebru Akagunduz Date: Thu, 5 Nov 2015 18:47:32 -0800 Subject: Documentation/vm/transhuge.txt: add information about max_ptes_swap max_ptes_swap specifies how many pages can be brought in from swap when collapsing a group of pages into a transparent huge page. /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap A higher value can cause excessive swap IO and waste memory. A lower value can prevent THPs from being collapsed, resulting in fewer pages being collapsed into THPs, and lower memory access performance. Signed-off-by: Ebru Akagunduz Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Oleg Nesterov Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 8143b9e..8a28268 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -170,6 +170,16 @@ A lower value leads to gain less thp performance. Value of max_ptes_none can waste cpu time very little, you can ignore it.
+max_ptes_swap specifies how many pages can be brought in from +swap when collapsing a group of pages into a transparent huge page. + +/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap + +A higher value can cause excessive swap IO and waste +memory. A lower value can prevent THPs from being +collapsed, resulting fewer pages being collapsed into +THPs, and lower memory access performance. + == Boot parameter == You can change the sysfs boot time defaults of Transparent Hugepage -- cgit v0.10.2 From 42e2e45777a8c2ec32b6a3c3d81a7d454f6afb6d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:36 -0800 Subject: mm/vmscan: make inactive_anon/file_is_low return bool Make inactive_anon/file_is_low return bool due to these particular functions only using either one or zero as their return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 773fc8d..38d0481 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1859,7 +1859,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1876,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. */ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1891,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1911,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1922,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) return active > inactive; } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); -- cgit v0.10.2 From 13308ca9ef9acb325b52bd8562639b5844f3cbf2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 5 Nov 2015 18:47:40 -0800 Subject: mm/memcontrol: make mem_cgroup_inactive_anon_is_low() return bool Make mem_cgroup_inactive_anon_is_low return bool due to this particular function only using either one or zero as its return value. No functional change. 
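As an aside, the int-to-bool conversions in this and the preceding vmscan patch all follow one pattern. The sketch below is illustrative only; the names (predicate_is_low, ratio) are made up, and it assumes the kernel's bool type from <linux/types.h>:

	/* Before: an int-returning predicate with an explicit branch. */
	static int predicate_is_low_int(unsigned long active, unsigned long inactive,
					unsigned long ratio)
	{
		if (inactive * ratio < active)
			return 1;
		return 0;
	}

	/* After: a bool-returning predicate that returns the comparison directly. */
	static bool predicate_is_low(unsigned long active, unsigned long inactive,
				     unsigned long ratio)
	{
		return inactive * ratio < active;
	}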
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 641bb60..4142f94 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -382,7 +382,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { unsigned long inactive_ratio; unsigned long inactive; @@ -585,10 +585,10 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline int +static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { - return 1; + return true; } static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -- cgit v0.10.2 From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:44 -0800 Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL It was confirmed that a local unprivileged user can consume all memory reserves and hang up that system using time lag between the OOM killer sets TIF_MEMDIE on an OOM victim and sends SIGKILL to that victim, for printk() inside for_each_process() loop at oom_kill_process() can consume many seconds when there are many thread groups sharing the same memory. Before starting oom-depleter process: Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB As of invoking the OOM killer: Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB Between the thread group leader got TIF_MEMDIE and receives SIGKILL: Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB The oom-depleter's thread group leader which got TIF_MEMDIE started memset() in user space after the OOM killer set TIF_MEMDIE, and it was free to abuse ALLOC_NO_WATERMARKS by TIF_MEMDIE for memset() in user space until SIGKILL is delivered. If SIGKILL is delivered before TIF_MEMDIE is set, the oom-depleter can terminate without touching memory reserves. Although the possibility of hitting this time lag is very small for 3.19 and earlier kernels because TIF_MEMDIE is set immediately before sending SIGKILL, preemption or long interrupts (an extreme example is SysRq-t) can step between and allow memory allocations which are not needed for terminating the OOM victim. 
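Condensed, the fix in the diff below is simply a reordering of two calls, so that the kill is queued before memory reserves are granted:

	/* Queue SIGKILL first, so the victim cannot keep running in user space
	 * on ALLOC_NO_WATERMARKS allocations... */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
	/* ...and only then mark it as the OOM victim (which sets TIF_MEMDIE). */
	mark_oom_victim(victim);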
Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock") Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ecc0bc..8ad35aa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; + /* + * We should send SIGKILL before setting TIF_MEMDIE in order to prevent + * the OOM victim from depleting the memory reserves from the user + * space under its control. + */ + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); put_task_struct(victim); } #undef K -- cgit v0.10.2 From 880b768937e90c433c0c8254a22b1eb63df005a4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:51 -0800 Subject: mm/oom_kill.c: fix potentially killing unrelated process At the for_each_process() loop in oom_kill_process(), we are comparing address of OOM victim's mm without holding a reference to that mm. If there are a lot of processes to compare or a lot of "Kill process %d (%s) sharing same memory" messages to print, for_each_process() loop could take very long time. It is possible that meanwhile the OOM victim exits and releases its mm, and then mm is allocated with the same address and assigned to some unrelated process. When we hit such race, the unrelated process will be killed by error. To make sure that the OOM victim's mm does not go away until for_each_process() loop finishes, get a reference on the OOM victim's mm before calling task_unlock(victim). [oleg@redhat.com: several fixes] Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ad35aa..5ba743a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -552,8 +552,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, victim = p; } - /* mm cannot safely be dereferenced after task_unlock(victim) */ + /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; + atomic_inc(&mm->mm_count); /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user @@ -591,6 +592,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, } rcu_read_unlock(); + mmdrop(mm); put_task_struct(victim); } #undef K -- cgit v0.10.2 From 840807a8f40bb25a8df5b6412bba6bc156643be5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:47:54 -0800 Subject: mm/oom_kill.c: suppress unnecessary "sharing same memory" message oom_kill_process() sends SIGKILL to other thread groups sharing victim's mm. But printing "Kill process %d (%s) sharing same memory\n" lines makes no sense if they already have pending SIGKILL. This patch reduces the "Kill process" lines by printing that line with info level only if SIGKILL is not pending. 
Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5ba743a..c170d9f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -583,9 +583,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; + if (fatal_signal_pending(p)) + continue; task_lock(p); /* Protect ->comm from prctl() */ - pr_err("Kill process %d (%s) sharing same memory\n", + pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); -- cgit v0.10.2 From fa6c7b46aaa0cc00846703e8c0ec1e1636ff25ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:56 -0800 Subject: mm, compaction: export tracepoints status strings to userspace Some compaction tracepoints convert the integer return values to strings using the compaction_status_string array. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch converts the private array to appropriate tracepoint macros that result in proper userspace support. trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret= after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compaction.h b/include/linux/compaction.h index aa8f61c..f14ba98 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,7 +15,7 @@ /* For more detailed tracepoint output */ #define COMPACT_NO_SUITABLE_PAGE 5 #define COMPACT_NOT_SUITABLE_ZONE 6 -/* When adding new state, please change compaction_status_string, too */ +/* When adding new states, please adjust include/trace/events/compaction.h */ /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 9a6a3fe..1275a55 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -9,6 +9,35 @@ #include #include +#define COMPACTION_STATUS \ + EM( COMPACT_DEFERRED, "deferred") \ + EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_CONTINUE, "continue") \ + EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_COMPLETE, "complete") \ + EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ + EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +COMPACTION_STATUS + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( @@ -161,7 +190,7 @@ TRACE_EVENT(mm_compaction_end, __entry->free_pfn, __entry->zone_end, __entry->sync ? 
"sync" : "async", - compaction_status_string[__entry->status]) + __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -217,7 +246,7 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template, __entry->nid, __entry->name, __entry->order, - compaction_status_string[__entry->ret]) + __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, diff --git a/mm/compaction.c b/mm/compaction.c index a8e6593..a5849c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#ifdef CONFIG_TRACEPOINTS -static const char *const compaction_status_string[] = { - "deferred", - "skipped", - "continue", - "partial", - "complete", - "no_suitable_page", - "not_suitable_zone", -}; -#endif #define CREATE_TRACE_POINTS #include -- cgit v0.10.2 From 1743d0506003f7db0602d120ecf63e2747af0d72 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:47:59 -0800 Subject: mm, compaction: export tracepoints zone names to userspace Some compaction tracepoints use zone->name to print which zone is being compacted. This works for in-kernel printing, but not userspace trace printing of raw captured trace such as via trace-cmd report. This patch uses zone_idx() instead of zone->name as the raw value, and when printing, converts the zone_type to string using the appropriate EM() macros and some ugly tricks to overcome the problem that half the values depend on CONFIG_ options and one does not simply use #ifdef inside of #define. trace-cmd output before: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret=partial after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=Normal order=9 ret=partial Signed-off-by: Vlastimil Babka Reviewed-by: Steven Rostedt Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Mel Gorman Cc: David Rientjes Cc: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 1275a55..a43000d7 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -18,6 +18,31 @@ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") +#ifdef CONFIG_ZONE_DMA +#define IFDEF_ZONE_DMA(X) X +#else +#define IFDEF_ZONE_DMA(X) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define IFDEF_ZONE_DMA32(X) X +#else +#define IFDEF_ZONE_DMA32(X) +#endif + +#ifdef CONFIG_HIGHMEM +#define IFDEF_ZONE_HIGHMEM(X) X +#else +#define IFDEF_ZONE_HIGHMEM(X) +#endif + +#define ZONE_TYPE \ + IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ + IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ + EM (ZONE_NORMAL, "Normal") \ + IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ + EMe(ZONE_MOVABLE,"Movable") + /* * First define the enums in the above macros to be exported to userspace * via TRACE_DEFINE_ENUM(). 
@@ -28,6 +53,7 @@ #define EMe(a, b) TRACE_DEFINE_ENUM(a); COMPACTION_STATUS +ZONE_TYPE /* * Now redefine the EM() and EMe() macros to map the enums to the strings @@ -230,21 +256,21 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_STRUCT__entry( __field(int, nid) - __field(char *, name) + __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); - __entry->name = (char *)zone->name; + __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, - __entry->name, + __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); @@ -276,7 +302,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_STRUCT__entry( __field(int, nid) - __field(char *, name) + __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) @@ -285,7 +311,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_fast_assign( __entry->nid = zone_to_nid(zone); - __entry->name = (char *)zone->name; + __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; @@ -294,7 +320,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, - __entry->name, + __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, __entry->considered, -- cgit v0.10.2 From 2d1e10412c2388ff9b6afc60536eaa195a419289 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 Nov 2015 18:48:02 -0800 Subject: mm, compaction: distinguish contended status in tracepoints Compaction returns prematurely with COMPACT_PARTIAL when contended or has fatal signal pending. This is ok for the callers, but might be misleading in the traces, as the usual reason to return COMPACT_PARTIAL is that we think the allocation should succeed. After this patch we distinguish the premature ending condition in the mm_compaction_finished and mm_compaction_end tracepoints. The contended status covers the following reasons: - lock contention or need_resched() detected in async compaction - fatal signal pending - too many pages isolated in the zone (only for async compaction) Further distinguishing the exact reason seems unnecessary for now. 
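Condensed, the end of compact_zone() then behaves as in the sketch below (a simplified view of the diff that follows): the tracepoint records the contended status, while callers keep seeing COMPACT_PARTIAL:

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn,
				end_pfn, sync, ret);	/* may log "contended" */

	if (ret == COMPACT_CONTENDED)
		ret = COMPACT_PARTIAL;	/* preserve the return contract for callers */

	return ret;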
Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f14ba98..4cd4ddf 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,6 +15,7 @@ /* For more detailed tracepoint output */ #define COMPACT_NO_SUITABLE_PAGE 5 #define COMPACT_NOT_SUITABLE_ZONE 6 +#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ /* Used to signal whether compaction detected need_sched() or lock contention */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index a43000d7..c92d1e1 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -16,7 +16,8 @@ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ - EMe(COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") + EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ + EMe(COMPACT_CONTENDED, "contended") #ifdef CONFIG_ZONE_DMA #define IFDEF_ZONE_DMA(X) X diff --git a/mm/compaction.c b/mm/compaction.c index a5849c4..de3e1e7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1202,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) - return COMPACT_PARTIAL; + return COMPACT_CONTENDED; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { @@ -1393,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; @@ -1424,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * and we want compact_finished() to detect it */ if (err == -ENOMEM && !compact_scanners_met(cc)) { - ret = COMPACT_PARTIAL; + ret = COMPACT_CONTENDED; goto out; } } @@ -1477,6 +1477,9 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); + if (ret == COMPACT_CONTENDED) + ret = COMPACT_PARTIAL; + return ret; } -- cgit v0.10.2 From da39da3a54fed88e29024f2f1f6cd7357cd03a44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:48:05 -0800 Subject: mm, oom: remove task_lock protecting comm printing The oom killer takes task_lock() in a couple of places solely to protect printing the task's comm. A process's comm, including current's comm, may change due to /proc/pid/comm or PR_SET_NAME. The comm will always be NULL-terminated, so the worst race scenario would only be during update. We can tolerate a comm being printed that is in the middle of an update to avoid taking the lock. Other locations in the kernel have already dropped task_lock() when printing comm, so this is consistent. 
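The before/after pattern is sketched below (illustrative only, with a simplified message); because ->comm is always NUL-terminated, the worst a lockless reader can observe is a partially updated name, never an unterminated buffer:

	/* Before: lock taken only to protect ->comm from a concurrent prctl(). */
	task_lock(p);
	pr_info("Kill process %d (%s)\n", task_pid_nr(p), p->comm);
	task_unlock(p);

	/* After: print without the lock; a torn-but-terminated comm is tolerated. */
	pr_info("Kill process %d (%s)\n", task_pid_nr(p), p->comm);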
Signed-off-by: David Rientjes Suggested-by: Oleg Nesterov Cc: Michal Hocko Cc: Vladimir Davydov Cc: Sergey Senozhatsky Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1b35799..5a13119 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); -extern void cpuset_print_task_mems_allowed(struct task_struct *p); +extern void cpuset_print_current_mems_allowed(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -219,7 +219,7 @@ static inline void rebuild_sched_domains(void) partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_print_task_mems_allowed(struct task_struct *p) +static inline void cpuset_print_current_mems_allowed(void) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0acff0..9ef59a3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } /** - * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @tsk: pointer to task_struct of some task. + * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * - * Description: Prints @task's name, cpuset name, and cached copy of its + * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ -void cpuset_print_task_mems_allowed(struct task_struct *tsk) +void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); - cgrp = task_cs(tsk)->css.cgroup; - pr_info("%s cpuset=", tsk->comm); + cgrp = task_cs(current)->css.cgroup; + pr_info("%s cpuset=", current->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); + pr_cont(" mems_allowed=%*pbl\n", + nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c170d9f..58f3d27 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -377,13 +377,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " "oom_score_adj=%hd\n", current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); - cpuset_print_task_mems_allowed(current); - task_unlock(current); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) mem_cgroup_print_oom_info(memcg, p); @@ -509,10 +507,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (__ratelimit(&oom_rs)) dump_header(oc, p, memcg); - task_lock(p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); - task_unlock(p); /* * If any of p's children has a different mm and is eligible for kill, @@ -586,10 +582,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, if (fatal_signal_pending(p)) continue; - task_lock(p); /* Protect ->comm from prctl() */ pr_info("Kill process %d (%s) sharing same memory\n", task_pid_nr(p), p->comm); - task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v0.10.2 From d031a157915e0508ffa1ab9f1bbf977257529cb4 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Thu, 5 Nov 2015 18:48:08 -0800 
Subject: mm/vmscan.c: fix types of some locals In zone_reclaimable_pages(), `nr' is returned by a function which is declared as returning "unsigned long", so declare it such. Negative values are meaningless here. In zone_pagecache_reclaimable() we should also declare `delta' and `nr_pagecache_reclaimable' as being unsigned longs because they're used to store the values returned by zone_page_state() and zone_unmapped_file_pages() which also happen to return unsigned integers. [akpm@linux-foundation.org: make zone_pagecache_reclaimable() return ulong rather than long] Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 38d0481..fdd89978 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc) static unsigned long zone_reclaimable_pages(struct zone *zone) { - int nr; + unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); @@ -3693,10 +3693,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered -- cgit v0.10.2 From 9fd745d450e7e2b0d2f1b386b886e7d568b64404 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:11 -0800 Subject: mm: fix overflow in find_zone_movable_pfns_for_nodes() If the user set "movablecore=xx" to a large number, corepages will overflow. Fix the problem. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Acked-by: Tang Chen Acked-by: David Rientjes Cc: Mel Gorman Cc: Tang Chen Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c60605d..4aed338 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5666,6 +5666,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); -- cgit v0.10.2 From 87e8827b37c0c391d9915d0dc6a06c9b5f9cac65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:14 -0800 Subject: mm: fix the racy mm->locked_vm change in "mm->locked_vm += grow" and vm_stat_account() in acct_stack_growth() are not safe; multiple threads using the same ->mm can do this at the same time trying to expans different vma's under down_read(mmap_sem). This means that one of the "locked_vm += grow" changes can be lost and we can miss munlock_vma_pages_all() later. Move this code into the caller(s) under mm->page_table_lock. All other updates to ->locked_vm hold mmap_sem for writing. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. 
Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 3ec19b6..d1ac224 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2138,10 +2138,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; - /* Ok, everything looks good - let it rip */ - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -2202,6 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2273,6 +2273,10 @@ int expand_downwards(struct vm_area_struct *vma, * against concurrent vma expansions. */ spin_lock(&vma->vm_mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; -- cgit v0.10.2 From 09357814778a38a5ab2d031cba6c9e9fe090c849 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:17 -0800 Subject: mm: add the "struct mm_struct *mm" local into Cosmetic, but expand_upwards() and expand_downwards() overuse vma->vm_mm, a local variable makes sense imho. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Cc: Andrey Konovalov Cc: Davidlohr Bueso Cc: "Kirill A. Shutemov" Cc: Sasha Levin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index d1ac224..3204a7e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2148,6 +2148,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; if (!(vma->vm_flags & VM_GROWSUP)) @@ -2197,10 +2198,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. 
*/ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; @@ -2208,8 +2209,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - vma->vm_mm->highest_vm_end = address; - spin_unlock(&vma->vm_mm->page_table_lock); + mm->highest_vm_end = address; + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2217,7 +2218,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2228,6 +2229,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; /* @@ -2272,17 +2274,17 @@ int expand_downwards(struct vm_area_struct *vma, * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. */ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -2290,7 +2292,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } -- cgit v0.10.2 From 0c1b2d783cf3432490bf1e532c742fffeadc0bf3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:20 -0800 Subject: mm/oom_kill: remove the wrong fatal_signal_pending() check in oom_kill_process() The fatal_signal_pending() was added to suppress unnecessary "sharing same memory" message, but it can't 100% help anyway because it can be false-negative; SIGKILL can be already dequeued. And worse, it can be false-positive due to exec or coredump. exec is mostly fine, but coredump is not. It is possible that the group leader has the pending SIGKILL because its sub-thread originated the coredump, in this case we must not skip this process. We could probably add the additional ->group_exit_task check but this patch just removes the wrong check along with pr_info(). 
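For illustration only, the additional check mentioned above might have looked roughly like the sketch below; this is an assumption about how ->group_exit_task could be used, not something the patch adds (the patch simply removes the check):

	/* Hypothetical alternative (NOT what this patch does): only skip tasks
	 * whose pending SIGKILL is not part of a coredump/exec group exit,
	 * i.e. when group_exit_task is not set. */
	if (fatal_signal_pending(p) && !p->signal->group_exit_task)
		continue;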
Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58f3d27..c837d06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -579,11 +579,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, !(p->flags & PF_KTHREAD)) { if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; - if (fatal_signal_pending(p)) - continue; - pr_info("Kill process %d (%s) sharing same memory\n", - task_pid_nr(p), p->comm); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v0.10.2 From c319025a6c79e532d862e3a0b9506ba316a4d13a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:23 -0800 Subject: mm/oom_kill: cleanup the "kill sharing same memory" loop Purely cosmetic, but the complex "if" condition looks annoying to me. Especially because it is not consistent with OOM_SCORE_ADJ_MIN check which adds another if/continue. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c837d06..2b6e880 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -574,14 +574,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * pending fatal signal. */ rcu_read_lock(); - for_each_process(p) - if (p->mm == mm && !same_thread_group(p, victim) && - !(p->flags & PF_KTHREAD)) { - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - continue; + for_each_process(p) { + if (p->mm != mm) + continue; + if (same_thread_group(p, victim)) + continue; + if (unlikely(p->flags & PF_KTHREAD)) + continue; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); - } + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + } rcu_read_unlock(); mmdrop(mm); -- cgit v0.10.2 From 4d7b3394f76ed72cfdec23ca5571dbab6ec41793 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 5 Nov 2015 18:48:26 -0800 Subject: mm/oom_kill: fix the wrong task->mm == mm checks in oom_kill_process() Both "child->mm == mm" and "p->mm != mm" checks in oom_kill_process() are wrong. task->mm can be NULL if the task is the exited group leader. This means in particular that "kill sharing same memory" loop can miss a process with a zombie leader which uses the same ->mm. Note: the process_has_mm(child, p->mm) check is still not 100% correct, p->mm can be NULL too. This is minor, but probably deserves a fix or a comment anyway. [akpm@linux-foundation.org: document process_shares_mm() a bit] Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Kyle Walker Cc: Stanislav Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2b6e880..e477828 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -474,6 +474,24 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. 
+ */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -521,7 +539,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; - if (child->mm == p->mm) + if (process_shares_mm(child, p->mm)) continue; /* * oom_badness() returns 0 if the thread is unkillable @@ -575,7 +593,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ rcu_read_lock(); for_each_process(p) { - if (p->mm != mm) + if (!process_shares_mm(p, mm)) continue; if (same_thread_group(p, victim)) continue; -- cgit v0.10.2 From 3ca65c19ddbb45f504edf92fe7126ecc94d56e36 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 5 Nov 2015 18:48:29 -0800 Subject: mm: optimize PageHighMem() check This came up when implementing HIHGMEM/PAE40 for ARC. The kmap() / kmap_atomic() generated code seemed needlessly bloated due to the way PageHighMem() macro is implemented. It derives the exact zone for page and then does pointer subtraction with first zone to infer the zone_type. The pointer arithmatic in turn generates the code bloat. PageHighMem(page) is_highmem(page_zone(page)) zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones Instead use is_highmem_idx() to work on zone_type available in page flags ----- Before ----- 80756348: mov_s r13,r0 8075634a: ld_s r2,[r13,0] 8075634c: lsr_s r2,r2,30 8075634e: mpy r2,r2,0x2a4 80756352: add_s r2,r2,0x80aef880 80756358: ld_s r3,[r2,28] 8075635a: sub_s r2,r2,r3 8075635c: breq r2,0x2a4,80756378 80756364: breq r2,0x548,80756378 ----- After ----- 80756330: mov_s r13,r0 80756332: ld_s r2,[r13,0] 80756334: lsr_s r2,r2,30 80756336: sub_s r2,r2,1 80756338: brlo r2,2,80756348 For x86 defconfig build (32 bit only) it saves around 900 bytes. For ARC defconfig with HIGHMEM, it saved around 2K bytes. ---->8------- ./scripts/bloat-o-meter x86/vmlinux-defconfig-pre x86/vmlinux-defconfig-post add/remove: 0/0 grow/shrink: 0/36 up/down: 0/-934 (-934) function old new delta saveable_page 162 154 -8 saveable_highmem_page 154 146 -8 skb_gro_reset_offset 147 131 -16 ... ... __change_page_attr_set_clr 1715 1678 -37 setup_data_read 434 394 -40 mon_bin_event 1967 1927 -40 swsusp_save 1148 1105 -43 _set_pages_array 549 493 -56 ---->8------- e.g. For ARC kmap() Signed-off-by: Vineet Gupta Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Jennifer Herbert Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 416509e..a525e50 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -256,7 +256,7 @@ PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. 
*/ -#define PageHighMem(__p) is_highmem(page_zone(__p)) +#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else PAGEFLAG_FALSE(HighMem) #endif -- cgit v0.10.2 From e6ee219fdd69c87ceaeb421bcd753a63937f8f31 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:32 -0800 Subject: mm/mmap.c: remove redundant statement "error = -ENOMEM" It is still a little better to remove it, although it should be skipped by "-O2". Signed-off-by: Chen Gang =0A= Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 3204a7e..28d1b35 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1562,7 +1562,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Clear old maps */ - error = -ENOMEM; while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) -- cgit v0.10.2 From 1e3ee14b9355a688ffe24725fa746ab120c42881 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:35 -0800 Subject: mm/mmap.c: do not initialize retval in mmap_pgoff() When fget() fails we can return -EBADF directly. Signed-off-by: Chen Gang Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 28d1b35..7e69f30 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, pgoff) { struct file *file = NULL; - unsigned long retval = -EBADF; + unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) - goto out; + return -EBADF; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); retval = -EINVAL; @@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, out_fput: if (file) fput(file); -out: return retval; } -- cgit v0.10.2 From c9427bc043da23de03d142c3c87ce4a57297c471 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 5 Nov 2015 18:48:38 -0800 Subject: mm/nommu.c: drop unlikely inside BUG_ON() (1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care less and the change is ok. (2) ppc and mips, which HAVE_ARCH_BUG_ON, do not rely on branch predictions as it seems to be pointless[1] and thus callers should not be trying to push an optimization in the first place. (3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an unlikely compiler flag already. Hence, we can drop unlikely behind BUG_ON(). 
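For reference, the generic definition alluded to in (3) already folds in the branch hint, so an extra unlikely() at the call site buys nothing; roughly, from include/asm-generic/bug.h:

	#ifndef HAVE_ARCH_BUG_ON
	#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
	#endif
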
[1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html Signed-off-by: Geliang Tang Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/nommu.c b/mm/nommu.c index 1e0f168..92be862 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(last->vm_end <= last->vm_start)); - BUG_ON(unlikely(last->vm_top < last->vm_end)); + BUG_ON(last->vm_end <= last->vm_start); + BUG_ON(last->vm_top < last->vm_end); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(region->vm_end <= region->vm_start)); - BUG_ON(unlikely(region->vm_top < region->vm_end)); - BUG_ON(unlikely(region->vm_start < last->vm_top)); + BUG_ON(region->vm_end <= region->vm_start); + BUG_ON(region->vm_top < region->vm_end); + BUG_ON(region->vm_start < last->vm_top); lastp = p; } -- cgit v0.10.2 From 27f28b972e12a4080e5f5e4eb36b8224705652d4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 5 Nov 2015 18:48:41 -0800 Subject: mm/mmap.c: change __install_special_mapping() args order Make __install_special_mapping() args order match the caller, so the caller can pass their register args directly to callee with no touch. For most of architectures, args (at least the first 5th args) are in registers, so this change will have effect on most of architectures. For -O2, __install_special_mapping() may be inlined under most of architectures, but for -Os, it should not. So this change can get a little better performance for -Os, at least. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 7e69f30..220effd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3052,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma, static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_operations_struct *ops, - void *priv) + unsigned long vm_flags, void *priv, + const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; @@ -3102,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping( unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { - return __install_special_mapping(mm, addr, len, vm_flags, - &special_mapping_vmops, (void *)spec); + return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, + &special_mapping_vmops); } int install_special_mapping(struct mm_struct *mm, @@ -3111,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm, unsigned long vm_flags, struct page **pages) { struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, &legacy_special_mapping_vmops, - (void *)pages); + mm, addr, len, vm_flags, (void *)pages, + &legacy_special_mapping_vmops); return PTR_ERR_OR_ZERO(vma); } -- cgit v0.10.2 From c2d42c16ad83006a706d83e51a7268db04af733a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:48:43 -0800 Subject: mm/vmstat.c: uninline node_page_state() With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of stack. Uninlining node_page_state() reduces this to 440 bytes. 
The stack consumption issue is fixed by newer gcc (4.8.4) however with that compiler this patch reduces the node.o text size from 7314 bytes to 4578. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7..49dfe40 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } #ifdef CONFIG_NUMA -/* - * Determine the per node value of a stat item. This function - * is called frequently in a NUMA machine, so try to be as - * frugal as possible. - */ -static inline unsigned long node_page_state(int node, - enum zone_stat_item item) -{ - struct zone *zones = NODE_DATA(node)->node_zones; - - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); -} +extern unsigned long node_page_state(int node, enum zone_stat_item item); extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else diff --git a/mm/vmstat.c b/mm/vmstat.c index fbf1448..ffcb4f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) else __inc_zone_state(z, NUMA_OTHER); } + +/* + * Determine the per node value of a stat item. + */ +unsigned long node_page_state(int node, enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + + return +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], item) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], item) + +#endif +#ifdef CONFIG_HIGHMEM + zone_page_state(&zones[ZONE_HIGHMEM], item) + +#endif + zone_page_state(&zones[ZONE_NORMAL], item) + + zone_page_state(&zones[ZONE_MOVABLE], item); +} + #endif #ifdef CONFIG_COMPACTION -- cgit v0.10.2 From a1c34a3bf00af2cede839879502e12dc68491ad5 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 5 Nov 2015 18:48:46 -0800 Subject: mm: Don't offset memmap for flatmem Srinivas Kandagatla reported bad page messages when trying to remove the bottom 2MB on an ARM based IFC6410 board BUG: Bad page state in process swapper pfn:fffa8 page:ef7fb500 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x96640253(locked|error|dirty|active|arch_1|reclaim|mlocked) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: flags: 0x200041(locked|active|mlocked) Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 3.19.0-rc3-00007-g412f9ba-dirty #816 Hardware name: Qualcomm (Flattened Device Tree) unwind_backtrace show_stack dump_stack bad_page free_pages_prepare free_hot_cold_page __free_pages free_highmem_page mem_init start_kernel Disabling lock debugging due to kernel taint Removing the lower 2MB made the start of the lowmem zone to no longer be page block aligned. IFC6410 uses CONFIG_FLATMEM where alloc_node_mem_map allocates memory for the mem_map. alloc_node_mem_map will offset for unaligned nodes with the assumption the pfn/page translation functions will account for the offset. The functions for CONFIG_FLATMEM do not offset however, resulting in overrunning the memmap array. Just use the allocated memmap without any offset when running with CONFIG_FLATMEM to avoid the overrun. 
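The mismatch is in the FLATMEM pfn/page translation, which only compensates for ARCH_PFN_OFFSET and knows nothing about a node start that is not MAX_ORDER aligned; roughly, from include/asm-generic/memory_model.h:

	/* CONFIG_FLATMEM flavour of the memory model helpers */
	#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
	#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
					 ARCH_PFN_OFFSET)

That is why the FLATMEM case below has to correct mem_map for the offset itself rather than relying on the translation to account for it.
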
Signed-off-by: Laura Abbott Signed-off-by: Laura Abbott Reported-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Acked-by: Vlastimil Babka Tested-by: Bjorn Andersson Cc: Santosh Shilimkar Cc: Russell King Cc: Kevin Hilman Cc: Arnd Bergman Cc: Stephen Boyd Cc: Andy Gross Cc: Mel Gorman Cc: Steven Rostedt Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aed338..86f7d95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5421,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused offset = 0; + /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; @@ -5437,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5444,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); - pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + pgdat->node_mem_map = map + offset; } #ifndef CONFIG_NEED_MULTIPLE_NODES /* @@ -5452,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); + mem_map -= offset; #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -- cgit v0.10.2 From f7ae3a95eab4e6766768cef664d2d429f53bd5f4 Mon Sep 17 00:00:00 2001 From: yalin wang Date: Thu, 5 Nov 2015 18:48:50 -0800 Subject: include/linux/vm_event_item.h: change HIGHMEM_ZONE macro definition Change HIGHMEM_ZONE to be the same as the DMA_ZONE macro. Signed-off-by: yalin wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9246d32..e623d39 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -14,12 +14,12 @@ #endif #ifdef CONFIG_HIGHMEM -#define HIGHMEM_ZONE(xx) , xx##_HIGH +#define HIGHMEM_ZONE(xx) xx##_HIGH, #else #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), -- cgit v0.10.2 From a2c1aad3b5fccbb948878b75f9b8f13248666fd6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 5 Nov 2015 18:48:52 -0800 Subject: mm/vmacache: inline vmacache_valid_mm() This function incurs in very hot paths and merely does a few loads for validity check. Lets inline it, such that we can save the function call overhead. 
(akpm: this is cosmetic - the compiler already inlines vmacache_valid_mm()) Signed-off-by: Davidlohr Bueso Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmacache.c b/mm/vmacache.c index b6e3662..fd09dc9 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm) * Also handle the case where a kernel thread has adopted this mm via use_mm(). * That kernel thread's vmacache is not applicable to this mm. */ -static bool vmacache_valid_mm(struct mm_struct *mm) +static inline bool vmacache_valid_mm(struct mm_struct *mm) { return current->mm == mm && !(current->flags & PF_KTHREAD); } -- cgit v0.10.2 From bde304bdf4ec4a5f58cc1e90fe2d9cd2d96304c4 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:48:56 -0800 Subject: mm/page_alloc.c: skip ZONE_MOVABLE if required_kernelcore is larger than totalpages If kernelcore was not specified, or the kernelcore size is zero (required_movablecore >= totalpages), or the kernelcore size is larger than totalpages, there is no ZONE_MOVABLE. We should fill the zone with both kernel memory and movable memory. Signed-off-by: Xishi Qiu Reviewed-by: Yasuaki Ishimatsu Cc: Mel Gorman Cc: David Rientjes Cc: Tang Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86f7d95..06e6230 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5675,8 +5675,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. + */ + if (!required_kernelcore || required_kernelcore >= totalpages) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ -- cgit v0.10.2 From d05e83a6f861ad02c2fcba75d4c4cfe49e3bc90f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:48:59 -0800 Subject: memcg: simplify charging kmem pages Charging kmem pages proceeds in two steps. First, we try to charge the allocation size to the memcg the current task belongs to, then we allocate a page and "commit" the charge storing the pointer to the memcg in the page struct. Such a design looks overcomplicated, because there is not much sense in trying charging the allocation before actually allocating a page: we won't be able to consume much memory over the limit even if we charge after doing the actual allocation, besides we already charge user pages post factum, so being pedantic with kmem pages just looks pointless. So this patch simplifies the design by merging the "charge" and the "commit" steps into the same function, which takes the allocated page. Also, rename the charge and uncharge methods to memcg_kmem_charge and memcg_kmem_uncharge and make the charge method return error code instead of bool to conform to mem_cgroup_try_charge. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4142f94..c952466 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,11 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. 
*/ -bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, - int order); -void __memcg_kmem_commit_charge(struct page *page, - struct mem_cgroup *memcg, int order); -void __memcg_kmem_uncharge_pages(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); /* * helper for acessing a memcg's index. It will be used as an index in the @@ -789,52 +786,30 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) } /** - * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. - * @gfp: the gfp allocation flags. - * @memcg: a pointer to the memcg this was charged against. - * @order: allocation order. + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order * - * returns true if the memcg where the current task belongs can hold this - * allocation. - * - * We return true automatically if this allocation is not to be accounted to - * any memcg. + * Returns 0 on success, an error code on failure. */ -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) +static __always_inline int memcg_kmem_charge(struct page *page, + gfp_t gfp, int order) { if (__memcg_kmem_bypass(gfp)) - return true; - return __memcg_kmem_newpage_charge(gfp, memcg, order); + return 0; + return __memcg_kmem_charge(page, gfp, order); } /** - * memcg_kmem_uncharge_pages: uncharge pages from memcg - * @page: pointer to struct page being freed - * @order: allocation order. + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order */ -static inline void -memcg_kmem_uncharge_pages(struct page *page, int order) +static __always_inline void memcg_kmem_uncharge(struct page *page, int order) { if (memcg_kmem_enabled()) - __memcg_kmem_uncharge_pages(page, order); -} - -/** - * memcg_kmem_commit_charge: embeds correct memcg in a page - * @page: pointer to struct page recently allocated - * @memcg: the memcg structure we charged against - * @order: allocation order. - * - * Needs to be called after memcg_kmem_newpage_charge, regardless of success or - * failure of the allocation. if @page is NULL, this function will revert the - * charges. Otherwise, it will commit @page to @memcg. - */ -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) -{ - if (memcg_kmem_enabled() && memcg) - __memcg_kmem_commit_charge(page, memcg, order); + __memcg_kmem_uncharge(page, order); } /** @@ -878,18 +853,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) return false; } -static inline bool -memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) -{ - return true; -} - -static inline void memcg_kmem_uncharge_pages(struct page *page, int order) +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { + return 0; } -static inline void -memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) +static inline void memcg_kmem_uncharge(struct page *page, int order) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1c05ff..e249279 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2404,57 +2404,26 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. 
But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. - */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret; - *_memcg = NULL; - memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); - return true; + return 0; } ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; css_put(&memcg->css); - return (ret == 0); -} - -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) -{ - VM_BUG_ON(mem_cgroup_is_root(memcg)); - - /* The page allocation failed. Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } page->mem_cgroup = memcg; + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06e6230..446bb36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3455,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } -- cgit v0.10.2 From f3ccb2c42297757d2e9b820ad37960462df7b7c1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:01 -0800 Subject: memcg: unify slab and other kmem pages charging We have memcg_kmem_charge and memcg_kmem_uncharge methods for charging and uncharging kmem pages to memcg, but currently they are not used for charging slab pages (i.e. they are only used for charging pages allocated with alloc_kmem_pages). The only reason why the slab subsystem uses special helpers, memcg_charge_slab and memcg_uncharge_slab, is that it needs to charge to the memcg of kmem cache while memcg_charge_kmem charges to the memcg that the current task belongs to. 
To remove this diversity, this patch adds an extra argument to __memcg_kmem_charge that can be a pointer to a memcg or NULL. If it is not NULL, the function tries to charge to the memcg it points to, otherwise it charge to the current context. Next, it makes the slab subsystem use this function to charge slab pages. Since memcg_charge_kmem and memcg_uncharge_kmem helpers are now used only in __memcg_kmem_charge and __memcg_kmem_uncharge, they are inlined. Since __memcg_kmem_charge stores a pointer to the memcg in the page struct, we don't need memcg_uncharge_slab anymore and can use free_kmem_pages. Besides, one can now detect which memcg a slab page belongs to by reading /proc/kpagecgroup. Note, this patch switches slab to charge-after-alloc design. Since this design is already used for all other memcg charges, it should not make any difference. [hannes@cmpxchg.org: better to have an outer function than a magic parameter for the memcg lookup] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c952466..0ba4c86 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -752,6 +752,8 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) * conditions, but because they are pretty simple, they are expected to be * fast. */ +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge(struct page *page, int order); @@ -770,10 +772,6 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep); struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages); -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); - static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e249279..9575cff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2215,34 +2215,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2404,36 +2376,59 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(current->mm); + unsigned int nr_pages = 1 << order; + struct page_counter *counter; + int ret = 0; - if 
(!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); + if (!memcg_kmem_is_active(memcg)) return 0; + + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret) + return ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); + page->mem_cgroup = memcg; + return 0; +} + +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); - page->mem_cgroup = memcg; return ret; } void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) diff --git a/mm/slab.c b/mm/slab.c index 461935b..272e809 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1593,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - if (memcg_charge_slab(cachep, flags, cachep->gfporder)) - return NULL; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } + if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + __free_pages(page, cachep->gfporder); + return NULL; + } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (page_is_pfmemalloc(page)) pfmemalloc_active = true; @@ -1654,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_pages(page, cachep->gfporder); - memcg_uncharge_slab(cachep, cachep->gfporder); + __free_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index bf51a8d..27492eb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -236,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } -static __always_inline int memcg_charge_slab(struct kmem_cache *s, - gfp_t gfp, int order) +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) { if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); -} - -static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ - if (!memcg_kmem_enabled()) - return; - if (is_root_cache(s)) - return; - memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); + return __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -289,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } -static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +static inline int memcg_charge_slab(struct 
page *page, gfp_t gfp, int order, + struct kmem_cache *s) { return 0; } -static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ -} - static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index e1bb147..423dbe7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1328,16 +1328,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, flags |= __GFP_NOTRACK; - if (memcg_charge_slab(s, flags, order)) - return NULL; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else page = __alloc_pages_node(node, flags, order); - if (!page) - memcg_uncharge_slab(s, order); + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1476,8 +1475,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); - memcg_uncharge_slab(s, order); + __free_kmem_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v0.10.2 From df4065516b0dbfa35ac0e9b8124d441221c0a285 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:04 -0800 Subject: memcg: simplify and inline __mem_cgroup_from_kmem Before the previous patch ("memcg: unify slab and other kmem pages charging"), __mem_cgroup_from_kmem had to handle two types of kmem - slab pages and pages allocated with alloc_kmem_pages - memcg in the page struct. Now we can unify it. Since after it, this function becomes tiny we can fold it into mem_cgroup_from_kmem. [hughd@google.com: move mem_cgroup_from_kmem into list_lru.c] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0ba4c86..998311e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -770,8 +770,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); - static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) @@ -830,13 +828,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } - -static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) -{ - if (!memcg_kmem_enabled()) - return NULL; - return __mem_cgroup_from_kmem(ptr); -} #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -882,11 +873,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } - -static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) -{ - return NULL; -} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ - diff --git a/mm/list_lru.c b/mm/list_lru.c index 2823747..afc71ea 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -63,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + struct page *page; + + if (!memcg_kmem_enabled()) + return NULL; + page = virt_to_head_page(ptr); + return page->mem_cgroup; +} + static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { diff 
--git a/mm/memcontrol.c b/mm/memcontrol.c index 9575cff..c0111cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2430,24 +2430,6 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; - - return memcg; -} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v0.10.2 From ad12695f177c3403a64348b42718faf9727fe358 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:07 -0800 Subject: ksm: add cond_resched() to the rmap_walks While at it add it to the file and anon walks too. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/ksm.c b/mm/ksm.c index 7ee101e..e87dec7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1914,9 +1914,11 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; + cond_resched(); anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + cond_resched(); vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/rmap.c b/mm/rmap.c index d40e7ae..78a6928 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1609,6 +1609,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; @@ -1658,6 +1660,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; -- cgit v0.10.2 From f2e5ff85edea30a59b96cf9e20e8886991b0d097 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:10 -0800 Subject: ksm: don't fail stable tree lookups if walking over stale stable_nodes The stable_nodes can become stale at any time if the underlying pages gets freed. The stable_node gets collected and removed from the stable rbtree if that is detected during the rbtree lookups. Don't fail the lookup if running into stale stable_nodes, just restart the lookup after collecting the stale stable_nodes. Otherwise the CPU spent in the preparation stage is wasted and the lookup must be repeated at the next loop potentially failing a second time in a second stale stable_node. If we don't prune aggressively we delay the merging of the unstable node candidates and at the same time we delay the freeing of the stale stable_nodes. Keeping stale stable_nodes around wastes memory and it can't provide any benefit. 
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/ksm.c b/mm/ksm.c index e87dec7..9f182f9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,8 +1177,18 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } ret = memcmp_pages(page, tree_page); put_page(tree_page); @@ -1254,12 +1264,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage) unsigned long kpfn; struct rb_root *root; struct rb_node **new; - struct rb_node *parent = NULL; + struct rb_node *parent; struct stable_node *stable_node; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; +again: + parent = NULL; new = &root->rb_node; while (*new) { @@ -1269,8 +1281,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } ret = memcmp_pages(kpage, tree_page); put_page(tree_page); -- cgit v0.10.2 From 98666f8a2576b12f5f3ebcef61a8cdbefede1be3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:13 -0800 Subject: ksm: use the helper method to do the hlist_empty check This just uses the helper function to cleanup the assumption on the hlist_node internals. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/ksm.c b/mm/ksm.c index 9f182f9..d4ee159 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -625,7 +625,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) unlock_page(page); put_page(page); - if (stable_node->hlist.first) + if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; -- cgit v0.10.2 From 85c6e8dd23c6aa9ff299bf5256fe5f0a6c44f100 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:16 -0800 Subject: ksm: use find_mergeable_vma in try_to_merge_with_ksm_page Doing the VM_MERGEABLE check after the page == kpage check won't provide any meaningful benefit. The !vma->anon_vma check of find_mergeable_vma is the only superfluous bit in using find_mergeable_vma because the !PageAnon check of try_to_merge_one_page() implicitly checks for that, but it still looks cleaner to share the same find_mergeable_vma(). 
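For reference, the shared helper looks roughly like this (mm/ksm.c); its ksm_test_exit(), find_vma() and VM_MERGEABLE tests are what make the open-coded checks removed below redundant, while the !vma->anon_vma test is the harmless extra mentioned above:

	static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
							 unsigned long addr)
	{
		struct vm_area_struct *vma;

		if (ksm_test_exit(mm))
			return NULL;
		vma = find_vma(mm, addr);
		if (!vma || vma->vm_start > addr)
			return NULL;
		if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
			return NULL;
		return vma;
	}
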
Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/ksm.c b/mm/ksm.c index d4ee159..0183083 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1021,8 +1021,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (!(vma->vm_flags & VM_MERGEABLE)) - goto out; if (PageTransCompound(page) && page_trans_compound_anon_split(page)) goto out; BUG_ON(PageTransCompound(page)); @@ -1087,10 +1085,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, int err = -EFAULT; down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) + vma = find_mergeable_vma(mm, rmap_item->address); + if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); -- cgit v0.10.2 From c8f95ed1a9ce373d20d3b09a6a9fb91c8beef27e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 Nov 2015 18:49:19 -0800 Subject: ksm: unstable_tree_search_insert error checking cleanup get_mergeable_page() can only return NULL (also in case of errors) or the pinned mergeable page. It can't return an error different than NULL. This optimizes away the unnecessary error check. Add a return after the "out:" label in the callee to make it more readable. Signed-off-by: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Petr Holasek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/ksm.c b/mm/ksm.c index 0183083..b5cd647 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) flush_dcache_page(page); } else { put_page(page); -out: page = NULL; +out: + page = NULL; } up_read(&mm->mmap_sem); return page; @@ -1358,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (IS_ERR_OR_NULL(tree_page)) + if (!tree_page) return NULL; /* -- cgit v0.10.2 From 326c2597a3b93b8a89e0c3a2b41fd9971d095a97 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 5 Nov 2015 18:49:21 -0800 Subject: mm: clear pte in clear_soft_dirty() As mentioned in the commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit"), architectures like ppc64 don't do tlb flush in set_pte/pmd functions. So when dealing with existing pte in clear_soft_dirty, the pte must be cleared before being modified. 
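The modify_prot start/commit pair used below gives exactly that ordering; the generic fallback (roughly, from include/asm-generic/pgtable.h, which architectures may override with their own transaction) clears the entry first and only installs the new value at commit time:

	static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
						     unsigned long addr,
						     pte_t *ptep)
	{
		/* read the old value and leave the pte not-present */
		return ptep_get_and_clear(mm, addr, ptep);
	}

	static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
						     unsigned long addr,
						     pte_t *ptep, pte_t pte)
	{
		/* the entry was cleared above, so installing the new pte is safe */
		set_pte_at(mm, addr, ptep, pte);
	}
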
Signed-off-by: Laurent Dufour Reviewed-by: Aneesh Kumar K.V Cc: Pavel Emelyanov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 288185e..ef44bb4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -792,19 +792,20 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { + ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); ptent = pte_wrprotect(ptent); ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } - - set_pte_at(vma->vm_mm, addr, pte, ptent); } static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); -- cgit v0.10.2 From 5d3875a01eeafef7ef0b73a99462cb9bd55e8485 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 5 Nov 2015 18:49:24 -0800 Subject: mm: clear_soft_dirty_pmd() requires THP Don't build clear_soft_dirty_pmd() if transparent huge pages are not enabled. Signed-off-by: Laurent Dufour Reviewed-by: Aneesh Kumar K.V Cc: Pavel Emelyanov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef44bb4..187b3b5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -801,7 +801,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, addr, pte, ptent); } } +#else +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} +#endif +#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -815,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } - #else - -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} - static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { -- cgit v0.10.2 From 706874e9096e9d468eed9c2b03c8374e806535f3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 5 Nov 2015 18:49:27 -0800 Subject: mm: do not inc NR_PAGETABLE if ptlock_init failed If ALLOC_SPLIT_PTLOCKS is defined, ptlock_init may fail, in which case we shouldn't increment NR_PAGETABLE. Since small allocations, such as ptlock, normally do not fail (currently they can fail if kmemcg is used though), this patch does not really fix anything and should be considered as a code cleanup. Signed-off-by: Vladimir Davydov Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index fa08f3c..3c258f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1606,8 +1606,10 @@ static inline void pgtable_init(void) static inline bool pgtable_page_ctor(struct page *page) { + if (!ptlock_init(page)) + return false; inc_zone_page_state(page, NR_PAGETABLE); - return ptlock_init(page); + return true; } static inline void pgtable_page_dtor(struct page *page) -- cgit v0.10.2 From 7a14239a8fff45a241b6943a3ac444d5b67fcbed Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:30 -0800 Subject: mm Documentation: undoc non-linear vmas While updating some mm Documentation, I came across a few straggling references to the non-linear vmas which were happily removed in v4.0. Delete them. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 12ac0e4..9781b97 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -485,7 +485,6 @@ manner. The codes are the following: ac - area is accountable nr - swap space is not reserved for the area ht - area uses huge tlb pages - nl - non-linear mapping ar - architecture specific flag dd - do not include area into core dump sd - soft-dirty flag diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 6513fe2..479c3d0 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -99,12 +99,10 @@ Steps: 4. The new page is prepped with some settings from the old page so that accesses to the new page will discover a page with the correct settings. -5. All the page table references to the page are converted - to migration entries or dropped (nonlinear vmas). - This decrease the mapcount of a page. If the resulting - mapcount is not zero then we do not migrate the page. - All user space processes that attempt to access the page - will now wait on the page lock. +5. All the page table references to the page are converted to migration + entries. This decreases the mapcount of a page. If the resulting + mapcount is not zero then we do not migrate the page. All user space + processes that attempt to access the page will now wait on the page lock. 6. The radix tree lock is taken. This will cause all processes trying to access the page via the mapping to block on the radix tree spinlock. diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 32ee3a6..e983ee4 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -552,63 +552,17 @@ different reverse map mechanisms. is really unevictable or not. In this case, try_to_unmap_anon() will return SWAP_AGAIN. - (*) try_to_unmap_file() - linear mappings + (*) try_to_unmap_file() Unmapping of a mapped file page works the same as for anonymous mappings, except that the scan visits all VMAs that map the page's index/page offset - in the page's mapping's reverse map priority search tree. It also visits - each VMA in the page's mapping's non-linear list, if the list is - non-empty. + in the page's mapping's reverse map interval search tree. 
As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file page, try_to_unmap_file() will attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. - (*) try_to_unmap_file() - non-linear mappings - - If a page's mapping contains a non-empty non-linear mapping VMA list, then - try_to_un{map|lock}() must also visit each VMA in that list to determine - whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit - all VMAs in the non-linear list to ensure that the pages is not/should not - be mlocked. - - If a VM_LOCKED VMA is found in the list, the scan could terminate. - However, there is no easy way to determine whether the page is actually - mapped in a given VMA - either for unmapping or testing whether the - VM_LOCKED VMA actually pins the page. - - try_to_unmap_file() handles non-linear mappings by scanning a certain - number of pages - a "cluster" - in each non-linear VMA associated with the - page's mapping, for each file mapped page that vmscan tries to unmap. If - this happens to unmap the page we're trying to unmap, try_to_unmap() will - notice this on return (page_mapcount(page) will be 0) and return - SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to - recirculate this page. We take advantage of the cluster scan in - try_to_unmap_cluster() as follows: - - For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the - mmap semaphore of the associated mm_struct for read without blocking. - - If this attempt is successful and the VMA is VM_LOCKED, - try_to_unmap_cluster() will retain the mmap semaphore for the scan; - otherwise it drops it here. - - Then, for each page in the cluster, if we're holding the mmap semaphore - for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to - mlock the page. This call is a no-op if the page is already locked, - but will mlock any pages in the non-linear mapping that happen to be - unlocked. - - If one of the pages so mlocked is the page passed in to try_to_unmap(), - try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default - SWAP_AGAIN. This will allow vmscan to cull the page, rather than - recirculating it on the inactive list. - - Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it - returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED - VMA, but couldn't be mlocked. - try_to_munlock() REVERSE MAP SCAN --------------------------------- @@ -625,10 +579,9 @@ introduced a variant of try_to_unmap() called try_to_munlock(). try_to_munlock() calls the same functions as try_to_unmap() for anonymous and mapped file pages with an additional argument specifying unlock versus unmap processing. Again, these functions walk the respective reverse maps looking -for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file -pages mapped in linear VMAs, as in the try_to_unmap() case, the functions -attempt to acquire the associated mmap semaphore, mlock the page via -mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the +for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case, +the functions attempt to acquire the associated mmap semaphore, mlock the page +via mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page. 
If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap @@ -636,12 +589,6 @@ semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to recycle the page on the inactive list and hope that it has better luck with the page next time. -For file pages mapped into non-linear VMAs, the try_to_munlock() logic works -slightly differently. On encountering a VM_LOCKED non-linear VMA that might -map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the -page. munlock_vma_page() will just leave the page unlocked and let vmscan deal -with it - the usual fallback position. - Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. However, the scan can terminate when it encounters a VM_LOCKED VMA and can -- cgit v0.10.2 From b87537d9e2feb30f6a962f27eb32768682698d3b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:33 -0800 Subject: mm: rmap use pte lock not mmap_sem to set PageMlocked KernelThreadSanitizer (ktsan) has shown that the down_read_trylock() of mmap_sem in try_to_unmap_one() (when going to set PageMlocked on a page found mapped in a VM_LOCKED vma) is ineffective against races with exit_mmap()'s munlock_vma_pages_all(), because mmap_sem is not held when tearing down an mm. But that's okay, those races are benign; and although we've believed for years in that ugly down_read_trylock(), it's unsuitable for the job, and frustrates the good intention of setting PageMlocked when it fails. It just doesn't matter if here we read vm_flags an instant before or after a racing mlock() or munlock() or exit_mmap() sets or clears VM_LOCKED: the syscalls (or exit) work their way up the address space (taking pt locks after updating vm_flags) to establish the final state. We do still need to be careful never to mark a page Mlocked (hence unevictable) by any race that will not be corrected shortly after. The page lock protects from many of the races, but not all (a page is not necessarily locked when it's unmapped). But the pte lock we just dropped is good to cover the rest (and serializes even with munlock_vma_pages_all(), so no special barriers required): now hold on to the pte lock while calling mlock_vma_page(). Is that lock ordering safe? Yes, that's how follow_page_pte() calls it, and how page_remove_rmap() calls the complementary clear_page_mlock(). This fixes the following case (though not a case which anyone has complained of), which mmap_sem did not: truncation's preliminary unmap_mapping_range() is supposed to remove even the anonymous COWs of filecache pages, and that might race with try_to_unmap_one() on a VM_LOCKED vma, so that mlock_vma_page() sets PageMlocked just after zap_pte_range() unmaps the page, causing "Bad page state (mlocked)" when freed. The pte lock protects against this. You could say that it also protects against the more ordinary case, racing with the preliminary unmapping of a filecache page itself: but in our current tree, that's independently protected by i_mmap_rwsem; and that race would be why "Bad page state (mlocked)" was seen before commit 48ec833b7851 ("Revert mm/memory.c: share the i_mmap_rwsem"). Vlastimil Babka points out another race which this patch protects against. try_to_unmap_one() might reach its mlock_vma_page() TestSetPageMlocked a moment after munlock_vma_pages_all() did its Phase 1 TestClearPageMlocked: leaving PageMlocked and unevictable when it should be evictable. 
mmap_sem is ineffective because exit_mmap() does not hold it; page lock ineffective because __munlock_pagevec() only takes it afterwards, in Phase 2; pte lock is effective because __munlock_pagevec_fill() takes it to get the page, after VM_LOCKED was cleared from vm_flags, so visible to try_to_unmap_one. Kirill Shutemov points out that if the compiler chooses to implement a "vma->vm_flags &= VM_WHATEVER" or "vma->vm_flags |= VM_WHATEVER" operation with an intermediate store of unrelated bits set, since I'm here foregoing its usual protection by mmap_sem, try_to_unmap_one() might catch sight of a spurious VM_LOCKED in vm_flags, and make the wrong decision. This does not appear to be an immediate problem, but we may want to define vm_flags accessors in future, to guard against such a possibility. While we're here, make a related optimization in try_to_munmap_one(): if it's doing TTU_MUNLOCK, then there's no point at all in descending the page tables and getting the pt lock, unless the vma is VM_LOCKED. Yes, that can change racily, but it can change racily even without the optimization: it's not critical. Far better not to waste time here. Stopped short of separating try_to_munlock_one() from try_to_munmap_one() on this occasion, but that's probably the sensible next step - with a rename, given that try_to_munlock()'s business is to try to set Mlocked. Updated the unevictable-lru Documentation, to remove its reference to mmap semaphore, but found a few more updates needed in just that area. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index e983ee4..fa3b527 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -531,37 +531,20 @@ map. try_to_unmap() is always called, by either vmscan for reclaim or for page migration, with the argument page locked and isolated from the LRU. Separate -functions handle anonymous and mapped file pages, as these types of pages have -different reverse map mechanisms. +functions handle anonymous and mapped file and KSM pages, as these types of +pages have different reverse map lookup mechanisms, with different locking. +In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(), +it will call try_to_unmap_one() for every VMA which might contain the page. - (*) try_to_unmap_anon() +When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED +VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it, +and return SWAP_MLOCK to indicate that the page is unevictable: and the scan +stops there. - To unmap anonymous pages, each VMA in the list anchored in the anon_vma - must be visited - at least until a VM_LOCKED VMA is encountered. If the - page is being unmapped for migration, VM_LOCKED VMAs do not stop the - process because mlocked pages are migratable. However, for reclaim, if - the page is mapped into a VM_LOCKED VMA, the scan stops. - - try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of - the mm_struct to which the VMA belongs. 
If this is successful, it will - mlock the page via mlock_vma_page() - we wouldn't have gotten to - try_to_unmap_anon() if the page were already mlocked - and will return - SWAP_MLOCK, indicating that the page is unevictable. - - If the mmap semaphore cannot be acquired, we are not sure whether the page - is really unevictable or not. In this case, try_to_unmap_anon() will - return SWAP_AGAIN. - - (*) try_to_unmap_file() - - Unmapping of a mapped file page works the same as for anonymous mappings, - except that the scan visits all VMAs that map the page's index/page offset - in the page's mapping's reverse map interval search tree. - - As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file - page, try_to_unmap_file() will attempt to acquire the associated - mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this - is successful, and SWAP_AGAIN, if not. +mlock_vma_page() is called while holding the page table's lock (in addition +to the page lock, and the rmap lock): to serialize against concurrent mlock or +munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim, +holepunching, and truncation of file pages and their anonymous COWed pages. try_to_munlock() REVERSE MAP SCAN @@ -577,22 +560,15 @@ all PTEs from the page. For this purpose, the unevictable/mlock infrastructure introduced a variant of try_to_unmap() called try_to_munlock(). try_to_munlock() calls the same functions as try_to_unmap() for anonymous and -mapped file pages with an additional argument specifying unlock versus unmap +mapped file and KSM pages with a flag argument specifying unlock versus unmap processing. Again, these functions walk the respective reverse maps looking for VM_LOCKED VMAs. When such a VMA is found, as in the try_to_unmap() case, -the functions attempt to acquire the associated mmap semaphore, mlock the page -via mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the -pre-clearing of the page's PG_mlocked done by munlock_vma_page. - -If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap -semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to -recycle the page on the inactive list and hope that it has better luck with the -page next time. +the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK. This +undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page. Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. -However, the scan can terminate when it encounters a VM_LOCKED VMA and can -successfully acquire the VMA's mmap semaphore for read and mlock the page. +However, the scan can terminate when it encounters a VM_LOCKED VMA. Although try_to_munlock() might be called a great many times when munlocking a large region or tearing down a large address space that has been mlocked via mlockall(), overall this is a fairly rare event. @@ -620,11 +596,6 @@ Some examples of these unevictable pages on the LRU lists are: (3) mlocked pages that could not be isolated from the LRU and moved to the unevictable list in mlock_vma_page(). - (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't - acquire the VMA's mmap semaphore to test the flags and set PageMlocked. - munlock_vma_page() was forced to let the page back on to the normal LRU - list for vmscan to handle. 
- shrink_inactive_list() also diverts any unevictable pages that it finds on the inactive lists to the appropriate zone's unevictable list. diff --git a/mm/rmap.c b/mm/rmap.c index 78a6928..b93fb54 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1304,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + goto out; + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1314,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) - goto out_mlock; - + if (vma->vm_flags & VM_LOCKED) { + /* Holding pte lock, we do *not* need mmap_sem here */ + mlock_vma_page(page); + ret = SWAP_MLOCK; + goto out_unmap; + } if (flags & TTU_MUNLOCK) goto out_unmap; } @@ -1421,31 +1428,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) + if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; - -out_mlock: - pte_unmap_unlock(pte, ptl); - - - /* - * We need mmap_sem locking, Otherwise VM_LOCKED check makes - * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. - * if trylock failed, the page remain in evictable lru and later - * vmscan could retry to move the page to unevictable lru if the - * page is actually mlocked. - */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - ret = SWAP_MLOCK; - } - up_read(&vma->vm_mm->mmap_sem); - } - return ret; } bool is_vma_temporary_stack(struct vm_area_struct *vma) -- cgit v0.10.2 From 51afb12ba809db664682a31154c11e720e2c363c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:37 -0800 Subject: mm: page migration fix PageMlocked on migrated pages Commit e6c509f85455 ("mm: use clear_page_mlock() in page_remove_rmap()") in v3.7 inadvertently made mlock_migrate_page() impotent: page migration unmaps the page from userspace before migrating, and that commit clears PageMlocked on the final unmap, leaving mlock_migrate_page() with nothing to do. Not a serious bug, the next attempt at reclaiming the page would fix it up; but a betrayal of page migration's intent - the new page ought to emerge as PageMlocked. I don't see how to fix it for mlock_migrate_page() itself; but easily fixed in remove_migration_pte(), by calling mlock_vma_page() when the vma is VM_LOCKED - under pte lock as in try_to_unmap_one(). Delete mlock_migrate_page()? Not quite, it does still serve a purpose for migrate_misplaced_transhuge_page(): where we could replace it by a test, clear_page_mlock(), mlock_vma_page() sequence; but would that be an improvement? mlock_migrate_page() is fairly lean, and let's make it leaner by skipping the irq save/restore now clearly not needed. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index bc0fa9a..d4b807d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -271,20 +271,19 @@ extern unsigned int munlock_vma_page(struct page *page); extern void clear_page_mlock(struct page *page); /* - * mlock_migrate_page - called only from migrate_page_copy() to - * migrate the Mlocked page flag; update statistics. + * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() + * (because that does not go through the full procedure of migration ptes): + * to migrate the Mlocked page flag; update statistics. */ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { - unsigned long flags; int nr_pages = hpage_nr_pages(page); - local_irq_save(flags); + /* Holding pmd lock, no change in irq context: __mod is safe */ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); - local_irq_restore(flags); } } diff --git a/mm/migrate.c b/mm/migrate.c index 94961f4..ed72c49 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -171,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, ptep); unlock: @@ -537,7 +540,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) cpupid = page_cpupid_xchg_last(page, -1); page_cpupid_xchg_last(newpage, cpupid); - mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* * Please do not reorder this without considering how mm/ksm.c's @@ -1787,7 +1789,6 @@ fail_putback: SetPageActive(page); if (TestClearPageUnevictable(new_page)) SetPageUnevictable(page); - mlock_migrate_page(page, new_page); unlock_page(new_page); put_page(new_page); /* Free it */ @@ -1829,6 +1830,7 @@ fail_putback: goto fail_putback; } + mlock_migrate_page(new_page, page); mem_cgroup_migrate(page, new_page, false); page_remove_rmap(page); -- cgit v0.10.2 From 45637bab30d6e7651737f51aa99417baef4d114a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:40 -0800 Subject: mm: rename mem_cgroup_migrate to mem_cgroup_replace_page After v4.3's commit 0610c25daa3e ("memcg: fix dirty page migration") mem_cgroup_migrate() doesn't have much to offer in page migration: convert migrate_misplaced_transhuge_page() to set_page_memcg() instead. Then rename mem_cgroup_migrate() to mem_cgroup_replace_page(), since its remaining callers are replace_page_cache_page() and shmem_replace_page(): both of whom passed lrucare true, so just eliminate that argument. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 998311e..c06c70b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -297,8 +297,7 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare); +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); @@ -537,9 +536,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_migrate(struct page *oldpage, - struct page *newpage, - bool lrucare) +static inline void mem_cgroup_replace_page(struct page *old, struct page *new) { } diff --git a/mm/filemap.c b/mm/filemap.c index 884766d..58e04e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -551,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); - mem_cgroup_migrate(old, new, true); + mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0111cb..a44494f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4531,9 +4531,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. */ if (!trylock_page(page)) goto out; @@ -5495,7 +5494,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to * @lrucare: either or both pages might be on the LRU already @@ -5504,16 +5503,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * * Both pages must be locked, @newpage->mapping must be set up. 
*/ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5525,25 +5521,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. - */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* diff --git a/mm/migrate.c b/mm/migrate.c index ed72c49..836e410 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1831,8 +1830,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page, false); - + set_page_memcg(new_page, page_memcg(page)); + set_page_memcg(page, NULL); page_remove_rmap(page); spin_unlock(ptl); diff --git a/mm/shmem.c b/mm/shmem.c index 48ce829..6529226 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1023,7 +1023,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage, true); + mem_cgroup_replace_page(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v0.10.2 From 14e0f9bcc95f1aef26a9f860cceda35faee79b34 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:43 -0800 Subject: mm: correct a couple of page migration comments It's migrate.c not migration,c, and nowadays putback_movable_pages() not putback_lru_pages(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 836e410..d149cbb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1,5 +1,5 @@ /* - * Memory Migration functionality - linux/mm/migration.c + * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * @@ -1113,7 +1113,7 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_lru_pages() to return pages to the LRU + * The caller should call putback_movable_pages() to return pages to the LRU * or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. 
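The contract described by that corrected comment amounts to the following caller pattern (a rough illustrative sketch only; the allocation callback name and the reason code are placeholders, not taken from these patches):

	#include <linux/migrate.h>

	/* 'alloc_target_page' stands in for whatever new_page_t callback the caller supplies */
	ret = migrate_pages(&pagelist, alloc_target_page, NULL /* put_new_page */,
			    private, MIGRATE_SYNC, MR_SYSCALL);
	if (ret)		/* > 0: pages left unmigrated, < 0: error */
		putback_movable_pages(&pagelist);	/* return the stragglers to the LRU */

That is, putback_movable_pages() is only the caller's job when migrate_pages() reports a nonzero result; on full success the list is already empty.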
-- cgit v0.10.2 From 2def7424c9be0069831380823fdb5cf72103b919 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:46 -0800 Subject: mm: page migration use the put_new_page whenever necessary I don't know of any problem from the way it's used in our current tree, but there is one defect in page migration's custom put_new_page feature. An unused newpage is expected to be released with the put_new_page(), but there was one MIGRATEPAGE_SUCCESS (0) path which released it with putback_lru_page(): which can be very wrong for a custom pool. Fixed more easily by resetting put_new_page once it won't be needed, than by adding a further flag to modify the rc test. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Acked-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index d149cbb..2f2e223 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,10 +938,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, int force, enum migrate_mode mode, enum migrate_reason reason) { - int rc = 0; + int rc = MIGRATEPAGE_SUCCESS; int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + struct page *newpage; + newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -955,6 +956,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; rc = __unmap_and_move(page, newpage, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) + put_new_page = NULL; out: if (rc != -EAGAIN) { @@ -981,7 +984,7 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + if (put_new_page) { ClearPageSwapBacked(newpage); put_new_page(newpage, private); } else if (unlikely(__is_movable_balloon_page(newpage))) { @@ -1022,7 +1025,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *hpage, int force, enum migrate_mode mode) { - int rc = 0; + int rc = -EAGAIN; int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; @@ -1044,8 +1047,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!new_hpage) return -ENOMEM; - rc = -EAGAIN; - if (!trylock_page(hpage)) { if (!force || mode != MIGRATE_SYNC) goto out; @@ -1070,8 +1071,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); + put_new_page = NULL; + } unlock_page(hpage); out: @@ -1083,7 +1086,7 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + if (put_new_page) put_new_page(new_hpage, private); else putback_active_hugepage(new_hpage); -- cgit v0.10.2 From 7db7671f835ccad66db20154ac1274140937d9b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:49 -0800 Subject: mm: page migration trylock newpage at same level as oldpage Clean up page migration a little by moving the trylock of newpage from move_to_new_page() into __unmap_and_move(), where the old page has been locked. Adjust unmap_and_move_huge_page() and balloon_page_migrate() accordingly. But make one kind-of-functional change on the way: whereas trylock of newpage used to BUG() if it failed, now simply return -EAGAIN if so. 
Cutting out BUG()s is good, right? But, to be honest, this is really to extend the usefulness of the custom put_new_page feature, allowing a pool of new pages to be shared perhaps with racing uses. Use an "else" instead of that "skip_unmap" label. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index fcad832..d3116be 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage, struct balloon_dev_info *balloon = balloon_page_device(page); int rc = -EAGAIN; - /* - * Block others from accessing the 'newpage' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'newpage' at this point. - */ - BUG_ON(!trylock_page(newpage)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); if (WARN_ON(!__is_movable_balloon_page(page))) { dump_page(page, "not movable balloon page"); - unlock_page(newpage); return rc; } if (balloon && balloon->migratepage) rc = balloon->migratepage(balloon, newpage, page, mode); - unlock_page(newpage); return rc; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/migrate.c b/mm/migrate.c index 2f2e223..6d7774e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -727,13 +727,8 @@ static int move_to_new_page(struct page *newpage, struct page *page, struct address_space *mapping; int rc; - /* - * Block others from accessing the page when we get around to - * establishing additional references. We are the only one - * holding a reference to the new page at this point. - */ - if (!trylock_page(newpage)) - BUG(); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); /* Prepare mapping for the new page.*/ newpage->index = page->index; @@ -774,9 +769,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, remove_migration_ptes(page, newpage); page->mapping = NULL; } - - unlock_page(newpage); - return rc; } @@ -861,6 +853,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } + /* + * Block others from accessing the new page when we get around to + * establishing additional references. We are usually the only one + * holding a reference to newpage at this point. We used to have a BUG + * here if trylock_page(newpage) fails, but would like to allow for + * cases where there might be a race with the previous use of newpage. + * This is much like races on refcount of oldpage: just don't BUG(). + */ + if (unlikely(!trylock_page(newpage))) + goto out_unlock; + if (unlikely(isolated_balloon_page(page))) { /* * A ballooned page does not need any special attention from @@ -870,7 +873,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). 
*/ rc = balloon_page_migrate(newpage, page, mode); - goto out_unlock; + goto out_unlock_both; } /* @@ -889,30 +892,27 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto out_unlock; + goto out_unlock_both; } - goto skip_unmap; - } - - /* Establish migration ptes or remove ptes */ - if (page_mapped(page)) { + } else if (page_mapped(page)) { + /* Establish migration ptes */ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } -skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page, page_was_mapped, mode); if (rc && page_was_mapped) remove_migration_ptes(page, page); +out_unlock_both: + unlock_page(newpage); +out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - -out_unlock: unlock_page(page); out: return rc; @@ -1056,6 +1056,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); + if (unlikely(!trylock_page(new_hpage))) + goto put_anon; + if (page_mapped(hpage)) { try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); @@ -1068,6 +1071,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) remove_migration_ptes(hpage, hpage); + unlock_page(new_hpage); + +put_anon: if (anon_vma) put_anon_vma(anon_vma); -- cgit v0.10.2 From 5c3f9a67371643b6faa987622bc1b67667bab848 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:53 -0800 Subject: mm: page migration remove_migration_ptes at lock+unlock level Clean up page migration a little more by calling remove_migration_ptes() from the same level, on success or on failure, from __unmap_and_move() or from unmap_and_move_huge_page(). Don't reset page->mapping of a PageAnon old page in move_to_new_page(), leave that to when the page is freed. Except for here in page migration, it has been an invariant that a PageAnon (bit set in page->mapping) page stays PageAnon until it is freed, and I think we're safer to keep to that. And with the above rearrangement, it's necessary because zap_pte_range() wants to identify whether a migration entry represents a file or an anon page, to update the appropriate rss stats without waiting on it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 6d7774e..7b44ebdf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -722,7 +722,7 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int page_was_mapped, enum migrate_mode mode) + enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -755,19 +755,21 @@ static int move_to_new_page(struct page *newpage, struct page *page, * space which also has its own migratepage callback. This * is the most common path for page migration. 
*/ - rc = mapping->a_ops->migratepage(mapping, - newpage, page, mode); + rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc != MIGRATEPAGE_SUCCESS) { + /* + * When successful, old pagecache page->mapping must be cleared before + * page is freed; but stats require that PageAnon be left as PageAnon. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + set_page_memcg(page, NULL); + if (!PageAnon(page)) + page->mapping = NULL; + } else { set_page_memcg(newpage, NULL); newpage->mapping = NULL; - } else { - set_page_memcg(page, NULL); - if (page_was_mapped) - remove_migration_ptes(page, newpage); - page->mapping = NULL; } return rc; } @@ -902,10 +904,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, page_was_mapped, mode); + rc = move_to_new_page(newpage, page, mode); - if (rc && page_was_mapped) - remove_migration_ptes(page, page); + if (page_was_mapped) + remove_migration_ptes(page, + rc == MIGRATEPAGE_SUCCESS ? newpage : page); out_unlock_both: unlock_page(newpage); @@ -1066,10 +1069,11 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); + rc = move_to_new_page(new_hpage, hpage, mode); - if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) - remove_migration_ptes(hpage, hpage); + if (page_was_mapped) + remove_migration_ptes(hpage, + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); unlock_page(new_hpage); -- cgit v0.10.2 From 03f15c86c8d1b9d81e6d215715e110aef8f936e0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:56 -0800 Subject: mm: simplify page migration's anon_vma comment and flow __unmap_and_move() contains a long stale comment on page_get_anon_vma() and PageSwapCache(), with an odd control flow that's hard to follow. Mostly this reflects our confusion about the lifetime of an anon_vma, in the early days of page migration, before we could take a reference to one. Nowadays this seems quite straightforward: cut it all down to essentials. I cannot see the relevance of swapcache here at all, so don't treat it any differently: I believe the old comment reflects in part our anon_vma confusions, and in part the original v2.6.16 page migration technique, which used actual swap to migrate anon instead of swap-like migration entries. Why should a swapcache page not be migrated with the aid of migration entry ptes like everything else? So lose that comment now, and enable migration entries for swapcache in the next patch. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 7b44ebdf..08a7b6c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -819,6 +819,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, goto out_unlock; wait_on_page_writeback(page); } + /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. @@ -826,34 +827,15 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * of migration. 
File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. + * + * Only page_get_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + * But if we cannot get anon_vma, then we won't need it anyway, + * because that implies that the anon page is no longer mapped + * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) { - /* - * Only page_lock_anon_vma_read() understands the subtleties of - * getting a hold on an anon_vma from outside one of its mms. - */ + if (PageAnon(page) && !PageKsm(page)) anon_vma = page_get_anon_vma(page); - if (anon_vma) { - /* - * Anon page - */ - } else if (PageSwapCache(page)) { - /* - * We cannot be sure that the anon_vma of an unmapped - * swapcache page is safe to use because we don't - * know in advance if the VMA that this page belonged - * to still exists. If the VMA and others sharing the - * data have been freed, then the anon_vma could - * already be invalid. - * - * To avoid this possibility, swapcache pages get - * migrated but are not remapped when migration - * completes - */ - } else { - goto out_unlock; - } - } /* * Block others from accessing the new page when we get around to @@ -898,6 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } else if (page_mapped(page)) { /* Establish migration ptes */ + VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, + page); try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; -- cgit v0.10.2 From 470f119f012068e5d94458c98dc4eec102f88cd3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:49:59 -0800 Subject: mm: page migration use migration entry for swapcache too Hitherto page migration has avoided using a migration entry for a swapcache page mapped into userspace, apparently for historical reasons. So any page blessed with swapcache would entail a minor fault when it's next touched, which page migration otherwise tries to avoid. Swapcache in an mlocked area is rare, so won't often matter, but still better fixed. Just rearrange the block in try_to_unmap_one(), to handle TTU_MIGRATION before checking PageAnon, that's all (apart from some reindenting). Well, no, that's not quite all: doesn't this by the way fix a soft_dirty bug, that page migration of a file page was forgetting to transfer the soft_dirty bit? Probably not a serious bug: if I understand correctly, soft_dirty afficionados usually have to handle file pages separately anyway; but we publish the bit in /proc//pagemap on file mappings as well as anonymous, so page migration ought not to perturb it. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Reviewed-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/rmap.c b/mm/rmap.c index b93fb54..b577fbb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1379,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); else dec_mm_counter(mm, MM_FILEPAGES); + } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. 
do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - } else if (IS_ENABLED(CONFIG_MIGRATION)) { - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - BUG_ON(!(flags & TTU_MIGRATION)); - entry = make_migration_entry(page, pte_write(pteval)); + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { - /* Establish migration entry for a file page */ - swp_entry_t entry; - entry = make_migration_entry(page, pte_write(pteval)); - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else dec_mm_counter(mm, MM_FILEPAGES); -- cgit v0.10.2 From cf4b769abb8aef01f887543cb8308c0d8671367c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:02 -0800 Subject: mm: page migration avoid touching newpage until no going back We have had trouble in the past from the way in which page migration's newpage is initialized in dribs and drabs - see commit 8bdd63809160 ("mm: fix direct reclaim writeback regression") which proposed a cleanup. We have no actual problem now, but I think the procedure would be clearer (and alternative get_new_page pools safer to implement) if we assert that newpage is not touched until we are sure that it's going to be used - except for taking the trylock on it in __unmap_and_move(). So shift the early initializations from move_to_new_page() into migrate_page_move_mapping(), mapping and NULL-mapping paths. Similarly migrate_huge_page_move_mapping(), but its NULL-mapping path can just be deleted: you cannot reach hugetlbfs_migrate_page() with a NULL mapping. Adjust stages 3 to 8 in the Documentation file accordingly. Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 479c3d0..fea5c08 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -92,27 +92,26 @@ Steps: 2. Insure that writeback is complete. -3. Prep the new page that we want to move to. It is locked - and set to not being uptodate so that all accesses to the new - page immediately lock while the move is in progress. +3. Lock the new page that we want to move to. It is locked so that accesses to + this (not yet uptodate) page immediately lock while the move is in progress. -4. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -5. All the page table references to the page are converted to migration +4. All the page table references to the page are converted to migration entries. This decreases the mapcount of a page. If the resulting mapcount is not zero then we do not migrate the page. All user space processes that attempt to access the page will now wait on the page lock. -6. The radix tree lock is taken. This will cause all processes trying +5. The radix tree lock is taken. This will cause all processes trying to access the page via the mapping to block on the radix tree spinlock. -7. The refcount of the page is examined and we back out if references remain +6. The refcount of the page is examined and we back out if references remain otherwise we know that we are the only one referencing this page. -8. The radix tree is checked and if it does not contain the pointer to this +7. The radix tree is checked and if it does not contain the pointer to this page then we back out because someone else modified the radix tree. +8. The new page is prepped with some settings from the old page so that + accesses to the new page will discover a page with the correct settings. + 9. The radix tree is changed to point to the new page. 10. The reference count of the old page is dropped because the radix tree diff --git a/mm/migrate.c b/mm/migrate.c index 08a7b6c..3067e40 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -320,6 +320,14 @@ int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != expected_count) return -EAGAIN; + + /* No turning back from here */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + return MIGRATEPAGE_SUCCESS; } @@ -355,8 +363,15 @@ int migrate_page_move_mapping(struct address_space *mapping, } /* - * Now we know that no one else is looking at the page. + * Now we know that no one else is looking at the page: + * no turning back from here. 
*/ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { SetPageSwapCache(newpage); @@ -403,12 +418,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - if (!mapping) { - if (page_count(page) != 1) - return -EAGAIN; - return MIGRATEPAGE_SUCCESS; - } - spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -426,6 +435,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -730,21 +742,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); - - /* - * Indirectly called below, migrate_page_copy() copies PG_dirty and thus - * needs newpage's memcg set to transfer memcg dirty page accounting. - * So perform memcg migration in two steps: - * 1. set newpage->mem_cgroup (here) - * 2. clear page->mem_cgroup (below) - */ - set_page_memcg(newpage, page_memcg(page)); - mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -767,9 +764,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; - } else { - set_page_memcg(newpage, NULL); - newpage->mapping = NULL; } return rc; } @@ -971,10 +965,9 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (put_new_page) { - ClearPageSwapBacked(newpage); + if (put_new_page) put_new_page(newpage, private); - } else if (unlikely(__is_movable_balloon_page(newpage))) { + else if (unlikely(__is_movable_balloon_page(newpage))) { /* drop our reference, page already in the balloon */ put_page(newpage); } else -- cgit v0.10.2 From 42cb14b110a5698ccf26ce59c4441722605a3743 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:05 -0800 Subject: mm: migrate dirty page without clear_page_dirty_for_io etc clear_page_dirty_for_io() has accumulated writeback and memcg subtleties since v2.6.16 first introduced page migration; and the set_page_dirty() which completed its migration of PageDirty, later had to be moderated to __set_page_dirty_nobuffers(); then PageSwapBacked had to skip that too. No actual problems seen with this procedure recently, but if you look into what the clear_page_dirty_for_io(page)+set_page_dirty(newpage) is actually achieving, it turns out to be nothing more than moving the PageDirty flag, and its NR_FILE_DIRTY stat from one zone to another. It would be good to avoid a pile of irrelevant decrementations and incrementations, and improper event counting, and unnecessary descent of the radix_tree under tree_lock (to set the PAGECACHE_TAG_DIRTY which radix_tree_replace_slot() left in place anyway). Do the NR_FILE_DIRTY movement, like the other stats movements, while interrupts still disabled in migrate_page_move_mapping(); and don't even bother if the zone is the same. 
Do the PageDirty movement there under tree_lock too, where old page is frozen and newpage not yet visible: bearing in mind that as soon as newpage becomes visible in radix_tree, an un-page-locked set_page_dirty() might interfere (or perhaps that's just not possible: anything doing so should already hold an additional reference to the old page, preventing its migration; but play safe). But we do still need to transfer PageDirty in migrate_page_copy(), for those who don't go the mapping route through migrate_page_move_mapping(). Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Sasha Levin Cc: Dmitry Vyukov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 3067e40..2834fab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -313,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + struct zone *oldzone, *newzone; + int dirty; int expected_count = 1 + extra_count; void **pslot; @@ -331,6 +334,9 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } + oldzone = page_zone(page); + newzone = page_zone(newpage); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -378,6 +384,13 @@ int migrate_page_move_mapping(struct address_space *mapping, set_page_private(newpage, page_private(page)); } + /* Move dirty while page refs frozen and newpage not yet exposed */ + dirty = PageDirty(page); + if (dirty) { + ClearPageDirty(page); + SetPageDirty(newpage); + } + radix_tree_replace_slot(pslot, newpage); /* @@ -387,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_unfreeze_refs(page, expected_count - 1); + spin_unlock(&mapping->tree_lock); + /* Leave irq disabled to prevent preemption while updating stats */ + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be @@ -397,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping, * via NR_FILE_PAGES and NR_ANON_PAGES if they * are mapped to swap space. */ - __dec_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - if (!PageSwapCache(page) && PageSwapBacked(page)) { - __dec_zone_page_state(page, NR_SHMEM); - __inc_zone_page_state(newpage, NR_SHMEM); + if (newzone != oldzone) { + __dec_zone_state(oldzone, NR_FILE_PAGES); + __inc_zone_state(newzone, NR_FILE_PAGES); + if (PageSwapBacked(page) && !PageSwapCache(page)) { + __dec_zone_state(oldzone, NR_SHMEM); + __inc_zone_state(newzone, NR_SHMEM); + } + if (dirty && mapping_cap_account_dirty(mapping)) { + __dec_zone_state(oldzone, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_FILE_DIRTY); + } } - spin_unlock_irq(&mapping->tree_lock); + local_irq_enable(); return MIGRATEPAGE_SUCCESS; } @@ -524,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageMappedToDisk(page)) SetPageMappedToDisk(newpage); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - /* - * Want to mark the page and the radix tree as dirty, and - * redo the accounting that clear_page_dirty_for_io undid, - * but we can't use set_page_dirty because that function - * is actually a signal that all of the page has become dirty. 
- * Whereas only part of our page may be dirty. - */ - if (PageSwapBacked(page)) - SetPageDirty(newpage); - else - __set_page_dirty_nobuffers(newpage); - } + /* Move dirty on pages not done by migrate_page_move_mapping() */ + if (PageDirty(page)) + SetPageDirty(newpage); if (page_is_young(page)) set_page_young(newpage); -- cgit v0.10.2 From 3acaea6804b3a10e996ce6ebc342089f481e1cdb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 5 Nov 2015 18:50:08 -0800 Subject: mm/cma.c: suppress warning mm/cma.c: In function 'cma_alloc': mm/cma.c:366: warning: 'pfn' may be used uninitialized in this function The patch actually improves the tracing a bit: if alloc_contig_range() fails, tracing will display the offending pfn rather than -1. Cc: Stefan Strogin Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Laurent Pinchart Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/cma.c b/mm/cma.c index 4eb56ba..ea506eb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -363,7 +363,9 @@ err: */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) { - unsigned long mask, offset, pfn, start = 0; + unsigned long mask, offset; + unsigned long pfn = -1; + unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) start = bitmap_no + mask + 1; } - trace_cma_alloc(page ? pfn : -1UL, page, count, align); + trace_cma_alloc(pfn, page, count, align); pr_debug("%s(): returned %p\n", __func__, page); return page; -- cgit v0.10.2 From 9dd861d55b01f1d0848f82007e8665371ae18710 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 5 Nov 2015 18:50:11 -0800 Subject: mm/maccess.c: actually return -EFAULT from strncpy_from_unsafe As far as I can tell, strncpy_from_unsafe never returns -EFAULT. ret is the result of a __copy_from_user_inatomic(), which is 0 for success and positive (in this case necessarily 1) for access error - it is never negative. So we were always returning the length of the, possibly truncated, destination string. Signed-off-by: Rasmus Villemoes Acked-by: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/maccess.c b/mm/maccess.c index 1b13638..d159b1c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -104,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_enable(); set_fs(old_fs); - return ret < 0 ? ret : src - unsafe_addr; + return ret ? -EFAULT : src - unsafe_addr; } -- cgit v0.10.2 From b4e289a6a659c5c2c056a67fa4f31f3dd8317537 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 5 Nov 2015 18:50:14 -0800 Subject: mm/hugetlb: make node_hstates array static There are no users of the node_hstates array outside of the mm/hugetlb.c. So let's make it static. 
Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index abfbe8c..4fc590a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,7 +2376,7 @@ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; -struct node_hstate node_hstates[MAX_NUMNODES]; +static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices -- cgit v0.10.2 From 099730d67417dfee273e9b10ac2560ca7fac7eb9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:17 -0800 Subject: mm, hugetlb: use memory policy when available I have a hugetlbfs user which is never explicitly allocating huge pages with 'nr_hugepages'. They only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. This works, but they noticed that mbind() was not doing them any good and the pages were being allocated without respect for the policy they specified. The code in question is this: > struct page *alloc_huge_page(struct vm_area_struct *vma, ... > page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); > if (!page) { > page = alloc_buddy_huge_page(h, NUMA_NO_NODE); dequeue_huge_page_vma() is smart and will respect the VMA's memory policy. But, it only grabs _existing_ huge pages from the huge page pool. If the pool is empty, we fall back to alloc_buddy_huge_page() which obviously can't do anything with the VMA's policy because it isn't even passed the VMA. Almost everybody preallocates huge pages. That's probably why nobody has ever noticed this. Looking back at the git history, I don't think this _ever_ worked from when alloc_buddy_huge_page() was introduced in 7893d1d5, 8 years ago. The fix is to pass vma/addr down in to the places where we actually call in to the buddy allocator. It's fairly straightforward plumbing. This has been lightly tested. Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fc590a..899f6a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,7 +1437,76 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) dissolve_free_huge_page(pfn_to_page(pfn)); } -static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +/* + * There are 3 ways this can get called: + * 1. With vma+addr: we use the VMA's memory policy + * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge + * page from any node, and let the buddy allocator itself figure + * it out. + * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page + * strictly from 'nid' + */ +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) +{ + int order = huge_page_order(h); + gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + unsigned int cpuset_mems_cookie; + + /* + * We need a VMA to get a memory policy. If we do not + * have one, we use the 'nid' argument + */ + if (!vma) { + /* + * If a specific node is requested, make sure to + * get memory from there, but only when a node + * is explicitly specified. 
+ */ + if (nid != NUMA_NO_NODE) + gfp |= __GFP_THISNODE; + /* + * Make sure to call something that can handle + * nid=NUMA_NO_NODE + */ + return alloc_pages_node(nid, gfp, order); + } + + /* + * OK, so we have a VMA. Fetch the mempolicy and try to + * allocate a huge page with it. + */ + do { + struct page *page; + struct mempolicy *mpol; + struct zonelist *zl; + nodemask_t *nodemask; + + cpuset_mems_cookie = read_mems_allowed_begin(); + zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + mpol_cond_put(mpol); + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + if (page) + return page; + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return NULL; +} + +/* + * There are two ways to allocate a huge page: + * 1. When you have a VMA and an address (like a fault) + * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) + * + * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in + * this case which signifies that the allocation should be done with + * respect for the VMA's memory policy. + * + * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This + * implies that memory policies will not be taken in to account. + */ +static struct page *__alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) { struct page *page; unsigned int r_nid; @@ -1446,6 +1515,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) return NULL; /* + * Make sure that anyone specifying 'nid' is not also specifying a VMA. + * This makes sure the caller is picking _one_ of the modes with which + * we can call this function, not both. + */ + if (vma || (addr != -1)) { + WARN_ON_ONCE(addr == -1); + WARN_ON_ONCE(nid != NUMA_NO_NODE); + } + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1478,14 +1556,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } spin_unlock(&hugetlb_lock); - if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); - else - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); spin_lock(&hugetlb_lock); if (page) { @@ -1510,6 +1581,27 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } /* + * Allocate a huge page from 'nid'. Note, 'nid' may be + * NUMA_NO_NODE, which means that it may be allocated + * anywhere. + */ +struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) +{ + unsigned long addr = -1; + + return __alloc_buddy_huge_page(h, NULL, addr, nid); +} + +/* + * Use the VMA's mpolicy to allocate a huge page from the buddy. + */ +struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); +} + +/* * This allocation function is useful in the context where vma is irrelevant. * E.g. soft-offlining uses this function because it only cares physical * address of error page. 
@@ -1524,7 +1616,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = alloc_buddy_huge_page(h, nid); + page = __alloc_buddy_huge_page_no_mpol(h, nid); return page; } @@ -1554,7 +1646,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); if (!page) { alloc_ok = false; break; @@ -1787,7 +1879,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; -- cgit v0.10.2 From e0ec90ee7e6f6cbaa6d59ffb48d2a7af5e80e61d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:50:20 -0800 Subject: mm, hugetlbfs: optimize when NUMA=n My recent patch "mm, hugetlb: use memory policy when available" added some bloat to hugetlb.o. This patch aims to get some of the bloat back, especially when NUMA is not in play. It does this with an implicit #ifdef and marking some things static that should have been static in my first patch. It also makes the warnings only VM_WARN_ON()s. They were responsible for a pretty big chunk of the bloat. Doing this gets our NUMA=n text size back to a wee bit _below_ where we started before the original patch. It also shaves a bit of space off the NUMA=y case, but not much. Enforcing the mempolicy definitely takes some text and it's hard to avoid. size(1) output: text data bss dec hex filename 30745 3433 2492 36670 8f3e hugetlb.o.nonuma.baseline 31305 3755 2492 37552 92b0 hugetlb.o.nonuma.patch1 30713 3433 2492 36638 8f1e hugetlb.o.nonuma.patch2 (this patch) 25235 473 41276 66984 105a8 hugetlb.o.numa.baseline 25715 475 41276 67466 1078a hugetlb.o.numa.patch1 25491 473 41276 67240 106a8 hugetlb.o.numa.patch2 (this patch) Signed-off-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 899f6a8..241de27 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1455,9 +1455,14 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument + * have one, we use the 'nid' argument. + * + * The mempolicy stuff below has some non-inlined bits + * and calls ->vm_ops. That makes it hard to optimize at + * compile-time, even when NUMA is off and it does + * nothing. This helps the compiler optimize it out. */ - if (!vma) { + if (!IS_ENABLED(CONFIG_NUMA) || !vma) { /* * If a specific node is requested, make sure to * get memory from there, but only when a node @@ -1474,7 +1479,8 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, /* * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. + * allocate a huge page with it. We will only reach this + * when CONFIG_NUMA=y. */ do { struct page *page; @@ -1520,8 +1526,8 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * we can call this function, not both. 
*/ if (vma || (addr != -1)) { - WARN_ON_ONCE(addr == -1); - WARN_ON_ONCE(nid != NUMA_NO_NODE); + VM_WARN_ON_ONCE(addr == -1); + VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); } /* * Assume we will successfully allocate the surplus page to @@ -1585,6 +1591,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, * NUMA_NO_NODE, which means that it may be allocated * anywhere. */ +static struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) { unsigned long addr = -1; @@ -1595,6 +1602,7 @@ struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ +static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { -- cgit v0.10.2 From f5fc3c5d817435970aa301d066820a9ac12c8120 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:23 -0800 Subject: mm: memcontrol: eliminate root memory.current memory.current on the root level doesn't add anything that wouldn't be more accurate and detailed using system statistics. It already doesn't include slabs, and it'll be a pain to keep in sync when further memory types are accounted in the memory controller. Remove it. Note that this applies to the new unified hierarchy interface only. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a44494f..59b100f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5026,7 +5026,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5138,6 +5140,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { -- cgit v0.10.2 From 6071ca5201066f4b2a61cfb693dd186d6bc6e9f3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 Nov 2015 18:50:26 -0800 Subject: mm: page_counter: let page_counter_try_charge() return bool page_counter_try_charge() currently returns 0 on success and -ENOMEM on failure, which is surprising behavior given the function name. Make it follow the expected pattern of try_stuff() functions that return a boolean true to indicate success, or false for failure. 
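For illustration, a caller written against the new boolean convention looks roughly like this (a minimal sketch; the memcg, kmem and hugetlb_cgroup conversions in the diff below are the real call sites):

  #include <linux/errno.h>
  #include <linux/page_counter.h>

  /*
   * Sketch only: translate the boolean try_charge result back into the
   * -ENOMEM that higher layers expect, as the converted call sites do.
   */
  static int example_charge(struct page_counter *counter, unsigned long nr_pages)
  {
      struct page_counter *fail;

      if (!page_counter_try_charge(counter, nr_pages, &fail))
          return -ENOMEM;  /* 'fail' points at the counter that hit its limit */

      return 0;
  }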
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Linus Torvalds diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 17fa4f8..7e62920 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct page_counter *counter) void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_try_charge(struct page_counter *counter, - unsigned long nr_pages, - struct page_counter **fail); +bool page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_limit(struct page_counter *counter, unsigned long limit); int page_counter_memparse(const char *buf, const char *max, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 6e00574..33d59ab 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -186,7 +186,8 @@ again: } rcu_read_unlock(); - ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + ret = -ENOMEM; css_put(&h_cg->css); done: *ptr = h_cg; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59b100f..d47de73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2016,8 +2016,8 @@ retry: return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2381,14 +2381,13 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, { unsigned int nr_pages = 1 << order; struct page_counter *counter; - int ret = 0; + int ret; if (!memcg_kmem_is_active(memcg)) return 0; - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret) - return ret; + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; ret = try_charge(memcg, gfp, nr_pages); if (ret) { diff --git a/mm/page_counter.c b/mm/page_counter.c index 11b4bed..7c6a63d 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * - * Returns 0 on success, or -ENOMEM and @fail if the counter or one of - * its ancestors has hit its configured limit. + * Returns %true on success, or %false and @fail if the counter or one + * of its ancestors has hit its configured limit. 
*/ -int page_counter_try_charge(struct page_counter *counter, - unsigned long nr_pages, - struct page_counter **fail) +bool page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) { struct page_counter *c; @@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter, if (new > c->watermark) c->watermark = new; } - return 0; + return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); - return -ENOMEM; + return false; } /** -- cgit v0.10.2 From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 5 Nov 2015 18:50:29 -0800 Subject: memcg: fix thresholds for 32b architectures. Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") where thresholds were silently converted to use page units rather than bytes when interpreting the user input. The fix is not complete, though, as properly pointed out by Ben Hutchings during stable backport review. The page count is converted to bytes but unsigned long is used to hold the value which would be obviously not sufficient for 32b systems with more than 4G thresholds. The same applies to usage as taken from mem_cgroup_usage which might overflow. Let's remove this bytes vs. pages internal tracking differences and handle thresholds in page units internally. Chage mem_cgroup_usage() to return the value in page units and revert 424cdc141380 because this should be sufficient for the consistent handling. mem_cgroup_read_u64 as the only users of mem_cgroup_usage outside of the threshold handling code is converted to give the proper in bytes result. It is doing that already for page_counter output so this is more consistent as well. The value presented to the userspace is still in bytes units. Fixes: 424cdc141380 ("memcg: convert threshold to bytes") Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters") Signed-off-by: Michal Hocko Reported-by: Ben Hutchings Reviewed-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: From: Michal Hocko Subject: memcg-fix-thresholds-for-32b-architectures-fix Cc: Ben Hutchings Cc: Vladimir Davydov Cc: Johannes Weiner From: Andrew Morton Subject: memcg-fix-thresholds-for-32b-architectures-fix-fix don't attempt to inline mem_cgroup_usage() The compiler ignores the inline anwyay. And __always_inlining it adds 600 bytes of goop to the .o file. 
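Returning to the main fix, a small userspace illustration of the 32-bit overflow being avoided (a 4 KiB page size and a 32-bit unsigned long are assumed; the numbers are only for illustration):

  #include <stdio.h>

  int main(void)
  {
      /*
       * A 5 GiB threshold expressed in bytes needs 33 bits, so it cannot be
       * held in a 32-bit unsigned long and silently wraps; the same
       * threshold expressed in 4 KiB pages is tiny by comparison and fits.
       */
      unsigned long long bytes = 5ULL << 30;              /* 5368709120 */
      unsigned long pages = (unsigned long)(bytes >> 12); /*    1310720 */

      printf("bytes = %llu, pages = %lu\n", bytes, pages);
      return 0;
  }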
Cc: Ben Hutchings Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d47de73..38765d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2801,9 +2801,9 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -2816,7 +2816,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -2850,9 +2850,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -3352,7 +3352,6 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; - threshold <<= PAGE_SHIFT; mutex_lock(&memcg->thresholds_lock); -- cgit v0.10.2 From b72bdfa73603f2f81fbac651b9ae36807e877752 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:50:32 -0800 Subject: mm, oom: add comment for why oom_adj exists /proc/pid/oom_adj exists solely to avoid breaking existing userspace binaries that write to the tunable. Add a comment in the only possible location within the kernel tree to describe the situation and motivation for keeping it around. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index 29595af..bd3e9e6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, return simple_read_from_buffer(buf, count, ppos, buffer, len); } +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { -- cgit v0.10.2 From d0424c429f8e0555a337d71e0a13f2289c636ec9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:34 -0800 Subject: tmpfs: avoid a little creat and stat slowdown LKP reports that v4.2 commit afa2db2fb6f1 ("tmpfs: truncate prealloc blocks past i_size") causes a 14.5% slowdown in the AIM9 creat-clo benchmark. creat-clo does just what you'd expect from the name, and creat's O_TRUNC on 0-length file does indeed get into more overhead now shmem_setattr() tests "0 <= 0" instead of "0 < 0". 
I'm not sure how much we care, but I think it would not be too VW-like to add in a check for whether any pages (or swap) are allocated: if none are allocated, there's none to remove from the radix_tree. At first I thought that check would be good enough for the unmaps too, but no: we should not skip the unlikely case of unmapping pages beyond the new EOF, which were COWed from holes which have now been reclaimed, leaving none. This gives me an 8.5% speedup: on Haswell instead of LKP's Westmere, and running a debug config before and after: I hope those account for the lesser speedup. And probably someone has a benchmark where a thousand threads keep on stat'ing the same file repeatedly: forestall that report by adjusting v4.3 commit 44a30220bc0a ("shmem: recalculate file inode when fstat") not to take the spinlock in shmem_getattr() when there's no work to do. Signed-off-by: Hugh Dickins Reported-by: Ying Huang Tested-by: Ying Huang Cc: Josef Bacik Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 6529226..3b8b739 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -548,12 +548,12 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - + if (info->alloced - info->swapped != inode->i_mapping->nrpages) { + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } generic_fillattr(inode, stat); - return 0; } @@ -586,10 +586,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - shmem_truncate_range(inode, newsize, (loff_t)-1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); } } -- cgit v0.10.2 From a5be35632cf3048d3922e2e2fb05a4c6f0889ff0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 Nov 2015 18:50:37 -0800 Subject: Documentation/filesystems/proc.txt: a little tidying There's an odd line about "Locked" at the head of the description of /proc/meminfo: it seems to have strayed from /proc/PID/smaps, so lead it back there. Move "Swap" and "SwapPss" descriptions down above it, to match the order in the file (though "PageSize"s still undescribed). The example of "Locked: 374 kB" (the same as Pss, neither Rss nor Size) is so unlikely as to be misleading: just make it 0, this is /bin/bash text; which would be "dw" (disabled write) not "de" (do not expand). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9781b97..1e4a6cc 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -433,8 +433,8 @@ Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB -Locked: 374 kB -VmFlags: rd ex mr mw me de +Locked: 0 kB +VmFlags: rd ex mr mw me dw the first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. 
The remaining lines show the size of the mapping @@ -454,13 +454,13 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. -"Swap" shows how much would-be-anonymous memory is also used, but out on -swap. -"SwapPss" shows proportional swap share of this mapping. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. +"Swap" shows how much would-be-anonymous memory is also used, but out on swap. +"SwapPss" shows proportional swap share of this mapping. +"Locked" indicates whether the mapping is locked in memory or not. "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded @@ -824,9 +824,6 @@ varies by architecture and compile options. The following is from a > cat /proc/meminfo -The "Locked" indicates whether the mapping is locked in memory or not. - - MemTotal: 16344972 kB MemFree: 13634064 kB MemAvailable: 14836172 kB -- cgit v0.10.2 From 5ba97bf9d8754bd984e81c1061fcda682681939e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 Nov 2015 18:50:40 -0800 Subject: mm: remove refresh_cpu_vm_stats() definition for !SMP kernel refresh_cpu_vm_stats(int cpu) is no longer referenced by !SMP kernel since Linux 3.12. Signed-off-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 49dfe40..5dbc8b0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -247,7 +247,6 @@ static inline void __dec_zone_page_state(struct page *page, #define set_pgdat_percpu_threshold(pgdat, callback) { } -static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } -- cgit v0.10.2 From 0ba8663cbfae066fc504b858db7cbb7d03c2b872 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:43 -0800 Subject: mm/kasan: rename kasan_enabled() to kasan_report_enabled() The function only disable/enable reporting. In the later patch we will be adding a kasan early enable/disable. Rename kasan_enabled to properly reflect its function. 
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c242adf..a6b46cc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -63,7 +63,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_enabled(void) +static inline bool kasan_report_enabled(void) { return !current->kasan_depth; } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e07c94f..6c3f82b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -220,7 +220,7 @@ void kasan_report(unsigned long addr, size_t size, { struct kasan_access_info info; - if (likely(!kasan_enabled())) + if (likely(!kasan_report_enabled())) return; info.access_addr = (void *)addr; -- cgit v0.10.2 From 527f215b78976e94995dce7163b07539b576d519 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:46 -0800 Subject: mm/kasan: MODULE_VADDR is not available on all archs Use is_module_address instead Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6c3f82b..d269f20 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -85,9 +86,11 @@ static void print_error_description(struct kasan_access_info *info) static inline bool kernel_or_module_addr(const void *addr) { - return (addr >= (void *)_stext && addr < (void *)_end) - || (addr >= (void *)MODULES_VADDR - && addr < (void *)MODULES_END); + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; } static inline bool init_task_stack_addr(const void *addr) -- cgit v0.10.2 From f2377d4eaab2aabe1938b3974b5b94f5ba4c7ead Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:48 -0800 Subject: mm/kasan: don't use kasan shadow pointer in generic functions We can't use generic functions like print_hex_dump to access kasan shadow region. This require us to setup another kasan shadow region for the address passed (kasan shadow address). Some architectures won't be able to do that. Hence make a copy of the shadow region row and pass that to generic functions. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d269f20..c536708 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -164,14 +164,20 @@ static void print_shadow_for_address(const void *addr) for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { const void *kaddr = kasan_shadow_to_mem(shadow_row); char buffer[4 + (BITS_PER_LONG/8)*2]; + char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), (i == 0) ? ">%p: " : " %p: ", kaddr); - + /* + * We should not pass a shadow pointer to generic + * function, because generic functions may try to + * access kasan mapping for the passed address. 
+ */ kasan_disable_current(); + memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, - shadow_row, SHADOW_BYTES_PER_ROW, 0); + shadow_buf, SHADOW_BYTES_PER_ROW, 0); kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) -- cgit v0.10.2 From fc5aeeaf593278f07ffa4d97296e27423ecae867 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Nov 2015 18:50:51 -0800 Subject: mm/kasan: prevent deadlock in kasan reporting When we end up calling kasan_report in real mode, our shadow mapping for the spinlock variable will show poisoned. This will result in us calling kasan_report_error with lock_report spin lock held. To prevent this disable kasan reporting when we are priting error w.r.t kasan. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c536708..7833f07 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -173,12 +173,10 @@ static void print_shadow_for_address(const void *addr) * function, because generic functions may try to * access kasan mapping for the passed address. */ - kasan_disable_current(); memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, shadow_buf, SHADOW_BYTES_PER_ROW, 0); - kasan_enable_current(); if (row_is_guilty(shadow_row, shadow)) pr_err("%*c\n", @@ -195,6 +193,10 @@ void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -204,12 +206,17 @@ void kasan_report_error(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report_user_access(struct kasan_access_info *info) { unsigned long flags; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); @@ -222,6 +229,7 @@ void kasan_report_user_access(struct kasan_access_info *info) pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report(unsigned long addr, size_t size, -- cgit v0.10.2 From e91210766341cb356ead7fd39f07493a3d00b80f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:55 -0800 Subject: kasan: update reported bug types for not user nor kernel memory accesses Each access with address lower than kasan_shadow_to_mem(KASAN_SHADOW_START) is reported as user-memory-access. This is not always true, the accessed address might not be in user space. Fix this by reporting such accesses as null-ptr-derefs or wild-memory-accesses. There's another reason for this change. For userspace ASan we have a bunch of systems that analyze error types for the purpose of classification and deduplication. Sooner of later we will write them to KASAN as well. Then clearly and explicitly stated error types will bring value. 
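As a self-contained illustration of the classification this patch adds for addresses below the shadow start (the PAGE_SIZE and TASK_SIZE cut-offs are typical x86_64 values, assumed here only so the example runs on its own):

  #include <stdio.h>

  #define EX_PAGE_SIZE 4096UL                  /* assumption: 4 KiB pages   */
  #define EX_TASK_SIZE 0x00007ffffffff000UL    /* assumption: x86_64 layout */

  /* Mirrors the bug_type selection added in kasan_report_error(). */
  static const char *classify(unsigned long addr)
  {
      if (addr < EX_PAGE_SIZE)
          return "null-ptr-deref";
      if (addr < EX_TASK_SIZE)
          return "user-memory-access";
      return "wild-memory-access";
  }

  int main(void)  /* 64-bit host assumed for the sample addresses */
  {
      unsigned long samples[] = { 0x10UL, 0x7f0000001000UL, 0xdead000000000000UL };

      for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
          printf("%#lx -> %s\n", samples[i], classify(samples[i]));
      return 0;
  }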
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 8da2114..1104cb0 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -235,18 +235,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) static __always_inline void check_memory_region(unsigned long addr, size_t size, bool write) { - struct kasan_access_info info; - if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - info.access_addr = (void *)addr; - info.access_size = size; - info.is_write = write; - info.ip = _RET_IP_; - kasan_report_user_access(&info); + kasan_report(addr, size, write, _RET_IP_); return; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index a6b46cc..4f6c62e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -54,9 +54,6 @@ struct kasan_global { #endif }; -void kasan_report_error(struct kasan_access_info *info); -void kasan_report_user_access(struct kasan_access_info *info); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7833f07..964aaf4 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -189,9 +189,10 @@ static void print_shadow_for_address(const void *addr) static DEFINE_SPINLOCK(report_lock); -void kasan_report_error(struct kasan_access_info *info) +static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + const char *bug_type; /* * Make sure we don't end up in loop. @@ -200,32 +201,26 @@ void kasan_report_error(struct kasan_access_info *info) spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); - print_error_description(info); - print_address_description(info); - print_shadow_for_address(info->first_bad_addr); - pr_err("=================================" - "=================================\n"); - spin_unlock_irqrestore(&report_lock, flags); - kasan_enable_current(); -} - -void kasan_report_user_access(struct kasan_access_info *info) -{ - unsigned long flags; - - /* - * Make sure we don't end up in loop. - */ - kasan_disable_current(); - spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); - pr_err("BUG: KASan: user-memory-access on address %p\n", - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); - dump_stack(); + if (info->access_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + pr_err("BUG: KASan: %s on address %p\n", + bug_type, info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, + task_pid_nr(current)); + dump_stack(); + } else { + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + } pr_err("=================================" "=================================\n"); spin_unlock_irqrestore(&report_lock, flags); @@ -244,6 +239,7 @@ void kasan_report(unsigned long addr, size_t size, info.access_size = size; info.is_write = is_write; info.ip = ip; + kasan_report_error(&info); } -- cgit v0.10.2 From 0952d87fd6a6211ac51b2abdc5c066b49c651fd8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:50:58 -0800 Subject: kasan: update reported bug types for kernel memory accesses Update the names of the bad access types to better reflect the type of the access that happended and make these error types "literals" that can be used for classification and deduplication in scripts. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 964aaf4..cdf4c31 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -49,7 +49,7 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = "unknown crash"; + const char *bug_type = "unknown-crash"; u8 shadow_val; info->first_bad_addr = find_first_bad_addr(info->access_addr, @@ -58,21 +58,25 @@ static void print_error_description(struct kasan_access_info *info) shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); switch (shadow_val) { - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use after free"; + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; case KASAN_GLOBAL_REDZONE: - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - bug_type = "out of bounds access"; + bug_type = "global-out-of-bounds"; break; case KASAN_STACK_LEFT: case KASAN_STACK_MID: case KASAN_STACK_RIGHT: case KASAN_STACK_PARTIAL: - bug_type = "out of bounds on stack"; + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; break; } -- cgit v0.10.2 From cdf6a273dc4346277ab9d148ef29f6e058624a8c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:01 -0800 Subject: kasan: accurately determine the type of the bad access Makes KASAN accurately determine the type of the bad access. If the shadow byte value is in the [0, KASAN_SHADOW_SCALE_SIZE) range we can look at the next shadow byte to determine the type of the access. 
Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cdf4c31..be53a8f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -50,15 +50,26 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; - u8 shadow_val; + u8 *shadow_addr; info->first_bad_addr = find_first_bad_addr(info->access_addr, info->access_size); - shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - switch (shadow_val) { + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. + */ bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: -- cgit v0.10.2 From 25add7ec708170e4eaef1f9793a07803b2fb5c71 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:03 -0800 Subject: kasan: update log messages We decided to use KASAN as the short name of the tool and KernelAddressSanitizer as the full one. Update log messages according to that. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9ce5da2..d470cf2 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -126,5 +126,5 @@ void __init kasan_init(void) __flush_tlb_all(); init_task.kasan_depth = 0; - pr_info("Kernel address sanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1104cb0..be9b78d 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -518,7 +518,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("WARNING: KASAN doesn't support memory hot-add\n"); pr_err("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index be53a8f..ae6bd36 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -91,7 +91,7 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASan: %s in %pS at addr %p\n", + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", @@ -224,7 +224,7 @@ static void kasan_report_error(struct kasan_access_info *info) bug_type = "user-memory-access"; else bug_type = "wild-memory-access"; - pr_err("BUG: KASan: %s on address %p\n", + pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", info->is_write ? 
"Write" : "Read", -- cgit v0.10.2 From 0295fd5d570626817d10deadf5a2ad5e49c36a1d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:06 -0800 Subject: kasan: various fixes in documentation [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index 0d32355..94c88157 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt @@ -1,32 +1,31 @@ -Kernel address sanitizer -================ +KernelAddressSanitizer (KASAN) +============================== 0. Overview =========== -Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides +KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides a fast and comprehensive solution for finding use-after-free and out-of-bounds bugs. -KASan uses compile-time instrumentation for checking every memory access, -therefore you will need a gcc version of 4.9.2 or later. KASan could detect out -of bounds accesses to stack or global variables, but only if gcc 5.0 or later was -used to built the kernel. +KASAN uses compile-time instrumentation for checking every memory access, +therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is +required for detection of out-of-bounds accesses to stack or global variables. -Currently KASan is supported only for x86_64 architecture and requires that the -kernel be built with the SLUB allocator. +Currently KASAN is supported only for x86_64 architecture and requires the +kernel to be built with the SLUB allocator. 1. Usage -========= +======== To enable KASAN configure kernel with: CONFIG_KASAN = y -and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline -is compiler instrumentation types. The former produces smaller binary the -latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version -of 5.0 or later. +and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and +inline are compiler instrumentation types. The former produces smaller binary +the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC +version 5.0 or later. Currently KASAN works only with the SLUB memory allocator. For better bug detection and nicer report, enable CONFIG_STACKTRACE and put @@ -42,7 +41,7 @@ similar to the following to the respective kernel Makefile: KASAN_SANITIZE := n 1.1 Error reports -========== +================= A typical out of bounds access report looks like this: @@ -119,14 +118,16 @@ Memory state around the buggy address: ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== -First sections describe slub object where bad access happened. -See 'SLUB Debug output' section in Documentation/vm/slub.txt for details. +The header of the report discribe what kind of bug happened and what kind of +access caused it. It's followed by the description of the accessed slub object +(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and +the description of the accessed memory page. In the last section the report shows memory state around the accessed address. -Reading this part requires some more understanding of how KASAN works. +Reading this part requires some understanding of how KASAN works. 
-Each 8 bytes of memory are encoded in one shadow byte as accessible, -partially accessible, freed or they can be part of a redzone. +The state of each 8 aligned bytes of memory is encoded in one shadow byte. +Those 8 bytes can be accessible, partially accessible, freed or be a redzone. We use the following encoding for each shadow byte: 0 means that all 8 bytes of the corresponding memory region are accessible; number N (1 <= N <= 7) means that the first N bytes are accessible, and other (8 - N) bytes are not; @@ -139,7 +140,7 @@ the accessed address is partially accessible. 2. Implementation details -======================== +========================= From a high level, our approach to memory error detection is similar to that of kmemcheck: use shadow memory to record whether each byte of memory is safe -- cgit v0.10.2 From c63f06dd15797736c31dc86e6392811d0bac34a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:09 -0800 Subject: kasan: move KASAN_SANITIZE in arch/x86/boot/Makefile Move KASAN_SANITIZE in arch/x86/boot/Makefile above the comment related to SVGA_MODE, since the comment refers to 'the next line'. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d553e5..2ee62db 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,13 +9,13 @@ # Changed by many, many contributors over the years. # +KASAN_SANITIZE := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup. -KASAN_SANITIZE := n - SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage -- cgit v0.10.2 From 5d0926efe728e00afbd81a1e3c498222cf908d23 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 5 Nov 2015 18:51:12 -0800 Subject: kasan: update reference to kasan prototype repo Update the reference to the kasan prototype repository on github, since it was renamed. Signed-off-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index be9b78d..21c50dc 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ae6bd36..f5e068a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -4,7 +4,7 @@ * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov * * This program is free software; you can redistribute it and/or modify -- cgit v0.10.2 From f523e737c08f5daaec9fac017e1bc5695e6f2760 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 5 Nov 2015 18:51:15 -0800 Subject: lib: test_kasan: add some testcases Add some out of bounds testcases to test_kasan module. 
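The new cases all straddle the end of a small allocation; below is a userspace model of why the whole memset range, not just its first byte, has to be checked (the 8-byte granule size and the 0xfc redzone value are illustrative assumptions, and partially valid granules are ignored for brevity):

  #include <stdio.h>

  #define GRANULE 8

  /* Shadow for a kmalloc(8) object: one fully valid granule, then redzone. */
  static const unsigned char shadow[] = { 0x00, 0xfc };

  static int range_is_poisoned(unsigned long offset, unsigned long len)
  {
      for (unsigned long o = offset; o < offset + len; o++)
          if (shadow[o / GRANULE] != 0x00)
              return 1;
      return 0;
  }

  int main(void)
  {
      /* memset(ptr + 7, 0, 2): byte 7 is valid, byte 8 is in the redzone. */
      printf("poisoned = %d\n", range_is_poisoned(7, 2));
      return 0;
  }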
Signed-off-by: Wang Long Acked-by: Andrey Ryabinin Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c1efb1b..c32f3b0 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -138,6 +138,71 @@ static noinline void __init kmalloc_oob_16(void) kfree(ptr2); } +static noinline void __init kmalloc_oob_memset_2(void) +{ + char *ptr; + size_t size = 8; + + pr_info("out-of-bounds in memset2\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + memset(ptr+7, 0, 2); + kfree(ptr); +} + +static noinline void __init kmalloc_oob_memset_4(void) +{ + char *ptr; + size_t size = 8; + + pr_info("out-of-bounds in memset4\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + memset(ptr+5, 0, 4); + kfree(ptr); +} + + +static noinline void __init kmalloc_oob_memset_8(void) +{ + char *ptr; + size_t size = 8; + + pr_info("out-of-bounds in memset8\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + memset(ptr+1, 0, 8); + kfree(ptr); +} + +static noinline void __init kmalloc_oob_memset_16(void) +{ + char *ptr; + size_t size = 16; + + pr_info("out-of-bounds in memset16\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + memset(ptr+1, 0, 16); + kfree(ptr); +} + static noinline void __init kmalloc_oob_in_memset(void) { char *ptr; @@ -264,6 +329,10 @@ static int __init kmalloc_tests_init(void) kmalloc_oob_krealloc_less(); kmalloc_oob_16(); kmalloc_oob_in_memset(); + kmalloc_oob_memset_2(); + kmalloc_oob_memset_4(); + kmalloc_oob_memset_8(); + kmalloc_oob_memset_16(); kmalloc_uaf(); kmalloc_uaf_memset(); kmalloc_uaf2(); -- cgit v0.10.2 From e0d57714394f5e2ce4e2f9bbebf48e3c7a7fd3be Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 5 Nov 2015 18:51:18 -0800 Subject: kasan: Fix a type conversion error The current KASAN code can not find the following out-of-bounds bugs: char *ptr; ptr = kmalloc(8, GFP_KERNEL); memset(ptr+7, 0, 2); the cause of the problem is the type conversion error in *memory_is_poisoned_n* function. So this patch fix that. Signed-off-by: Wang Long Acked-by: Andrey Ryabinin Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 21c50dc..2b21ccd 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -203,7 +203,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) return true; } return false; -- cgit v0.10.2 From 10f702627e139e21465f4c9d44f63527bbca163c Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Thu, 5 Nov 2015 18:51:21 -0800 Subject: kasan: use IS_ALIGNED in memory_is_poisoned_8() Use IS_ALIGNED() to determine whether the shadow span two bytes. It generates less code and more readable. Also add some comments in shadow check functions. 
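For the 8-byte path the old and new conditions are equivalent; a quick brute-force check (KASAN_SHADOW_MASK is assumed to be 7, i.e. 8-byte shadow granules):

  #include <assert.h>
  #include <stdio.h>

  #define SHADOW_MASK 7UL     /* assumed value of KASAN_SHADOW_MASK */

  static int old_check(unsigned long addr) { return ((addr + 7) & SHADOW_MASK) >= 7; }
  static int new_check(unsigned long addr) { return (addr & SHADOW_MASK) == 0; } /* IS_ALIGNED(addr, 8) */

  int main(void)
  {
      for (unsigned long addr = 0; addr < 64; addr++)
          assert(old_check(addr) == new_check(addr));
      printf("old and new 8-byte checks agree for every alignment\n");
      return 0;
  }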
Signed-off-by: Xishi Qiu Acked-by: Andrey Ryabinin Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 2b21ccd..d41b21b 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr) if (memory_is_poisoned_1(addr + 1)) return true; + /* + * If single shadow byte covers 2-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) return false; @@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr) if (memory_is_poisoned_1(addr + 3)) return true; + /* + * If single shadow byte covers 4-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) return false; @@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr) if (memory_is_poisoned_1(addr + 7)) return true; - if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + /* + * If single shadow byte covers 8-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return unlikely(*(u8 *)shadow_addr); @@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) if (unlikely(shadow_first_bytes)) return true; - if (likely(IS_ALIGNED(addr, 8))) + /* + * If two shadow bytes covers 16-byte access, we don't + * need to do anything more. Otherwise, test the last + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return memory_is_poisoned_1(addr + 15); -- cgit v0.10.2 From 89d3c87e20d95e3238eac85e43de7b3cb1f39d8b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:23 -0800 Subject: mm, slub, kasan: enable user tracking by default with KASAN=y It's recommended to have slub's user tracking enabled with CONFIG_KASAN, because: a) User tracking disables slab merging which improves detecting out-of-bounds accesses. b) User tracking metadata acts as redzone which also improves detecting out-of-bounds accesses. c) User tracking provides additional information about object. This information helps to understand bugs. Currently it is not enabled by default. Besides recompiling the kernel with KASAN and reinstalling it, user also have to change the boot cmdline, which is not very handy. Enable slub user tracking by default with KASAN=y, since there is no good reason to not do this. [akpm@linux-foundation.org: little fixes, per David] Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kasan.txt b/Documentation/kasan.txt index 94c88157..aa1e0c9 100644 --- a/Documentation/kasan.txt +++ b/Documentation/kasan.txt @@ -28,8 +28,7 @@ the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC version 5.0 or later. Currently KASAN works only with the SLUB memory allocator. -For better bug detection and nicer report, enable CONFIG_STACKTRACE and put -at least 'slub_debug=U' in the boot cmdline. +For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. 
To disable instrumentation for specific files or directories, add a line similar to the following to the respective kernel Makefile: diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 39f24d6..0fee5ac 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -15,8 +15,7 @@ config KASAN global variables requires gcc 5.0 or later. This feature consumes about 1/8 of available memory and brings about ~x3 performance slowdown. - For better error detection enable CONFIG_STACKTRACE, - and add slub_debug=U to boot cmdline. + For better error detection enable CONFIG_STACKTRACE. choice prompt "Instrumentation type" diff --git a/mm/slub.c b/mm/slub.c index 423dbe7..75a5fa9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON +#if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; +#elif defined(CONFIG_KASAN) +static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif -- cgit v0.10.2 From eb06f43f1c94d502b7867b0998e92cdabbc060bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Nov 2015 18:51:26 -0800 Subject: kasan: always taint kernel on report Currently we already taint the kernel in some cases. E.g. if we hit some bug in slub memory we call object_err() which will taint the kernel with TAINT_BAD_PAGE flag. But for other kind of bugs kernel left untainted. Always taint with TAINT_BAD_PAGE if kasan found some bug. This is useful for automated testing. Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Reviewed-by: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f5e068a..12f222d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -238,6 +238,7 @@ static void kasan_report_error(struct kasan_access_info *info) } pr_err("=================================" "=================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); } -- cgit v0.10.2 From 1aab92ec3de552362397b718744872ea2d17add2 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:29 -0800 Subject: mm: mlock: refactor mlock, munlock, and munlockall code mlock() allows a user to control page out of program memory, but this comes at the cost of faulting in the entire mapping when it is allocated. For large mappings where the entire area is not necessary this is not ideal. Instead of forcing all locked pages to be present when they are allocated, this set creates a middle ground. Pages are marked to be placed on the unevictable LRU (locked) when they are first used, but they are not faulted in by the mlock call. This series introduces a new mlock() system call that takes a flags argument along with the start address and size. This flags argument gives the caller the ability to request memory be locked in the traditional way, or to be locked after the page is faulted in. A new MCL flag is added to mirror the lock on fault behavior from mlock() in mlockall(). There are two main use cases that this set covers. The first is the security focussed mlock case. A buffer is needed that cannot be written to swap. The maximum size is known, but on average the memory used is significantly less than this maximum. With lock on fault, the buffer is guaranteed to never be paged out without consuming the maximum size every time such a buffer is created. 
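In terms of the interface this series builds toward, that first use case looks roughly like the sketch below (mlock2() and MLOCK_ONFAULT are introduced by later patches in the set; the glibc-style wrapper is assumed here only so the example is self-contained):

  #define _GNU_SOURCE
  #include <stddef.h>
  #include <sys/mman.h>

  /*
   * Sketch of the "secret buffer" case: pages are locked as they are first
   * touched, so a mostly idle buffer never pins max_size of RAM up front,
   * yet no page that is actually used can be written out to swap.
   */
  void *alloc_locked_on_fault(size_t max_size)
  {
      void *buf = mmap(NULL, max_size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

      if (buf == MAP_FAILED)
          return NULL;

      if (mlock2(buf, max_size, MLOCK_ONFAULT)) {
          munmap(buf, max_size);
          return NULL;
      }
      return buf;
  }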
The second use case is focussed on performance. Portions of a large file are needed and we want to keep the used portions in memory once accessed. This is the case for large graphical models where the path through the graph is not known until run time. The entire graph is unlikely to be used in a given invocation, but once a node has been used it needs to stay resident for further processing. Given these constraints we have a number of options. We can potentially waste a large amount of memory by mlocking the entire region (this can also cause a significant stall at startup as the entire file is read in). We can mlock every page as we access them without tracking if the page is already resident but this introduces large overhead for each access. The third option is mapping the entire region with PROT_NONE and using a signal handler for SIGSEGV to mprotect(PROT_READ) and mlock() the needed page. Doing this page at a time adds a significant performance penalty. Batching can be used to mitigate this overhead, but in order to safely avoid trying to mprotect pages outside of the mapping, the boundaries of each mapping to be used in this way must be tracked and available to the signal handler. This is precisely what the mm system in the kernel should already be doing. For mlock(MLOCK_ONFAULT) the user is charged against RLIMIT_MEMLOCK as if mlock(MLOCK_LOCKED) or mmap(MAP_LOCKED) was used, so when the VMA is created not when the pages are faulted in. For mlockall(MCL_ONFAULT) the user is charged as if MCL_FUTURE was used. This decision was made to keep the accounting checks out of the page fault path. To illustrate the benefit of this set I wrote a test program that mmaps a 5 GB file filled with random data and then makes 15,000,000 accesses to random addresses in that mapping. The test program was run 20 times for each setup. Results are reported for two program portions, setup and execution. The setup phase is calling mmap and optionally mlock on the entire region. For most experiments this is trivial, but it highlights the cost of faulting in the entire region. Results are averages across the 20 runs in milliseconds. mmap with mlock(MLOCK_LOCKED) on entire range: Setup avg: 8228.666 Processing avg: 8274.257 mmap with mlock(MLOCK_LOCKED) before each access: Setup avg: 0.113 Processing avg: 90993.552 mmap with PROT_NONE and signal handler and batch size of 1 page: With the default value in max_map_count, this gets ENOMEM as I attempt to change the permissions, after upping the sysctl significantly I get: Setup avg: 0.058 Processing avg: 69488.073 mmap with PROT_NONE and signal handler and batch size of 8 pages: Setup avg: 0.068 Processing avg: 38204.116 mmap with PROT_NONE and signal handler and batch size of 16 pages: Setup avg: 0.044 Processing avg: 29671.180 mmap with mlock(MLOCK_ONFAULT) on entire range: Setup avg: 0.189 Processing avg: 17904.899 The signal handler in the batch cases faulted in memory in two steps to avoid having to know the start and end of the faulting mapping. The first step covers the page that caused the fault as we know that it will be possible to lock. The second step speculatively tries to mlock and mprotect the batch size - 1 pages that follow. There may be a clever way to avoid this without having the program track each mapping to be covered by this handeler in a globally accessible structure, but I could not find it. 
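A sketch of that two-step handler (illustrative only: the names, the batch size and the complete lack of per-mapping bookkeeping are assumptions, and error handling plus strict async-signal-safety are glossed over):

  #define _GNU_SOURCE
  #include <signal.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define BATCH 16                /* pages made resident per fault (illustrative) */

  static long page_size;          /* cached up front; sysconf() is not signal-safe */

  static void segv_handler(int sig, siginfo_t *si, void *ctx)
  {
      uintptr_t page = (uintptr_t)si->si_addr & ~((uintptr_t)page_size - 1);

      (void)sig;
      (void)ctx;

      /* Step 1: the page that faulted is known to be inside the mapping. */
      if (mprotect((void *)page, page_size, PROT_READ) ||
          mlock((void *)page, page_size))
          _exit(1);

      /*
       * Step 2: speculatively extend to the next BATCH - 1 pages.  Nothing
       * here knows where the mapping ends, which is exactly why a large
       * batch can stray past it, as noted below.
       */
      for (int i = 1; i < BATCH; i++) {
          void *p = (void *)(page + (uintptr_t)i * page_size);

          mprotect(p, page_size, PROT_READ);
          mlock(p, page_size);
      }
  }

  void install_handler(void)
  {
      struct sigaction sa;

      memset(&sa, 0, sizeof(sa));
      page_size = sysconf(_SC_PAGESIZE);
      sa.sa_sigaction = segv_handler;
      sa.sa_flags = SA_SIGINFO;
      sigemptyset(&sa.sa_mask);
      sigaction(SIGSEGV, &sa, NULL);
  }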
It should be noted that with a large enough batch size this two-step fault handler can still cause the program to crash if it reaches far beyond the end of the mapping.

These results show that if the developer knows that a majority of the mapping will be used, it is better to try and fault it in at once; otherwise mlock(MLOCK_ONFAULT) is significantly faster.

The performance cost of these patches is minimal on the two benchmarks I have tested (stream and kernbench). The following are the average values across 20 runs of stream and 10 runs of kernbench after a warmup run whose results were discarded.

Avg throughput in MB/s from stream using 1000000 element arrays
Test               4.2-rc1     4.2-rc1+lock-on-fault
Copy:             10,566.5    10,421
Scale:            10,685      10,503.5
Add:              12,044.1    11,814.2
Triad:            12,064.8    11,846.3

Kernbench optimal load
                   4.2-rc1     4.2-rc1+lock-on-fault
Elapsed Time       78.453      78.991
User Time          64.2395     65.2355
System Time         9.7335      9.7085
Context Switches  22211.5     22412.1
Sleeps            14965.3     14956.1

This patch (of 6):

Extending the mlock system call is very difficult because it currently does not take a flags argument. A later patch in this set will extend mlock to support a middle ground between pages that are locked and faulted in immediately and unlocked pages. To pave the way for the new system call, the code needs some reorganization so that all the actual entry point has to do is check its input and translate it to VMA flags.

Signed-off-by: Eric B Munson Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Michael Kerrisk Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Jonathan Corbet Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mlock.c b/mm/mlock.c index 550228d..fbd8c03 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -554,7 +554,8 @@ out: return ret; } -static int do_mlock(unsigned long start, size_t len, int on) +static int apply_vma_lock_flags(unsigned long start, size_t len, + vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct * vma, * prev; @@ -576,14 +577,11 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; - newflags = vma->vm_flags & ~VM_LOCKED; - if (on) - newflags |= VM_LOCKED; + newflags |= flags; + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ tmp = vma->vm_end; if (tmp > end) tmp = end; @@ -605,7 +603,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -629,7 +627,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) - error = do_mlock(start, len, 1); + error = apply_vma_lock_flags(start, len, flags); up_write(¤t->mm->mmap_sem); if (error) @@ -641,6 +639,11 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return 0; } +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; @@ -649,13 +652,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) start &= PAGE_MASK; down_write(¤t->mm->mmap_sem); - ret = do_mlock(start, len, 0); + ret = apply_vma_lock_flags(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; } -static int do_mlockall(int flags) +static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; @@ -663,6 +666,7 @@ static int do_mlockall(int flags) current->mm->def_flags |= VM_LOCKED; else current->mm->def_flags &= ~VM_LOCKED; + if (flags == MCL_FUTURE) goto out; @@ -703,7 +707,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) - ret = do_mlockall(flags); + ret = apply_mlockall_flags(flags); up_write(¤t->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); @@ -716,7 +720,7 @@ SYSCALL_DEFINE0(munlockall) int ret; down_write(¤t->mm->mmap_sem); - ret = do_mlockall(0); + ret = apply_mlockall_flags(0); up_write(¤t->mm->mmap_sem); return ret; } -- cgit v0.10.2 From a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:33 -0800 Subject: mm: mlock: add new mlock system call With the refactored mlock code, introduce a new system call for mlock. The new call will allow the user to specify what lock states are being added. mlock2 is trivial at the moment, but a follow on patch will add a new mlock state making it useful. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Heiko Carstens Cc: Geert Uytterhoeven Cc: Catalin Marinas Cc: Stephen Rothwell Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index caa2c71..f17705e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -382,3 +382,4 @@ 373 i386 shutdown sys_shutdown 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier +376 i386 mlock2 sys_mlock2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 278842f..314a90b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -331,6 +331,7 @@ 322 64 execveat stub_execveat 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a460e2e..a156b82 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, asmlinkage long sys_membarrier(int cmd, int flags); +asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ee12400..1324b02 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) __SYSCALL(__NR_userfaultfd, sys_userfaultfd) #define __NR_membarrier 283 __SYSCALL(__NR_membarrier, sys_membarrier) +#define __NR_mlock2 284 +__SYSCALL(__NR_mlock2, sys_mlock2) #undef __NR_syscalls -#define __NR_syscalls 284 +#define __NR_syscalls 285 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a02decf..0623787 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -194,6 +194,7 @@ cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); +cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); diff --git a/mm/mlock.c b/mm/mlock.c index fbd8c03..35dcf8f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -644,6 +644,14 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return do_mlock(start, len, VM_LOCKED); } +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + if (flags) + return -EINVAL; + + return do_mlock(start, len, VM_LOCKED); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; -- cgit v0.10.2 From de60f5f10c58d4f34b68622442c0e04180367f3f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:36 -0800 Subject: mm: introduce VM_LOCKONFAULT The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used this can incur a high penalty for locking. For the example of a large file, this is the usage pattern for a large statical language model (probably applies to other statical or graphical models as well). For the security example, any application transacting in data that cannot be swapped out (credit card data, medical records, etc). This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. 
The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT flag for a VMA will cause pages faulted into that VMA to be added to the unevictable LRU when they are faulted or if they are already present, but will not cause any missing pages to be faulted in. Exposing this new lock state means that we cannot overload the meaning of the FOLL_POPULATE flag any longer. Prior to this patch it was used to mean that the VMA for a fault was locked. This means we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control if the VMA should be populated and in the case of VM_LOCKONFAULT, it will not be set. Signed-off-by: Eric B Munson Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c258f8..906c46a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ @@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE +/* This mask is used to clear all the VMA flags used by mlock */ +#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_MLOCK 0x1000 /* lock present pages */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/kernel/fork.c b/kernel/fork.c index 6ac8942..a30fae4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,7 +454,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= + ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; diff --git a/mm/debug.c b/mm/debug.c index 6c1b3ea..e784110 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_GROWSDOWN, "growsdown" }, {VM_PFNMAP, "pfnmap" }, {VM_DENYWRITE, "denywrite" }, + {VM_LOCKONFAULT, "lockonfault" }, {VM_LOCKED, "locked" }, {VM_IO, "io" }, {VM_SEQ_READ, "seqread" }, diff --git a/mm/gup.c b/mm/gup.c index a798293..deafa2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -129,7 +129,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; /* For mm_populate(), just skip the stack guard page. 
*/ if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || @@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE; + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3fd0311..f5c08b4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 241de27..74ef0c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4137,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 35dcf8f..ca38941 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; while (start < end) { struct page *page = NULL; diff --git a/mm/mmap.c b/mm/mmap.c index 220effd..2ce04a6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1661,7 +1661,7 @@ out: vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); else - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; } if (file) -- cgit v0.10.2 From b0f205c2a3082dd9081f9a94e50658c5fa906ff1 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:39 -0800 Subject: mm: mlock: add mlock flags to enable VM_LOCKONFAULT usage The previous patch introduced a flag that specified pages in a VMA should be placed on the unevictable LRU, but they should not be made present when the area is created. This patch adds the ability to set this state via the new mlock system calls. We add MLOCK_ONFAULT for mlock2 and MCL_ONFAULT for mlockall. MLOCK_ONFAULT will set the VM_LOCKONFAULT modifier for VM_LOCKED. MCL_ONFAULT should be used as a modifier to the two other mlockall flags. When used with MCL_CURRENT, all current mappings will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with MCL_FUTURE, the mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. When used with both MCL_CURRENT and MCL_FUTURE, all current mappings and mm->def_flags will be marked with VM_LOCKED | VM_LOCKONFAULT. Prior to this patch, mlockall() will unconditionally clear the mm->def_flags any time it is called without MCL_FUTURE. This behavior is maintained after adding MCL_ONFAULT. 
If a call to mlockall(MCL_FUTURE) is followed by mlockall(MCL_CURRENT), the mm->def_flags will be cleared and new VMAs will be unlocked. This remains true with or without MCL_ONFAULT in either mlockall() invocation. munlock() will unconditionally clear both vma flags. munlockall() unconditionally clears for VMA flags on all VMAs and in the mm->def_flags field. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Kirill A. Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 0086b47..f2f9496 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -37,6 +37,9 @@ #define MCL_CURRENT 8192 /* lock all currently mapped pages */ #define MCL_FUTURE 16384 /* lock all additions to address space */ +#define MCL_ONFAULT 32768 /* lock all pages that are faulted in */ + +#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ #define MADV_NORMAL 0 /* no further special treatment */ #define MADV_RANDOM 1 /* expect random page references */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index cfcb876..97c03f4 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -61,6 +61,12 @@ */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#define MCL_ONFAULT 4 /* lock all pages that are faulted in */ + +/* + * Flags for mlock + */ +#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ #define MADV_NORMAL 0 /* no further special treatment */ #define MADV_RANDOM 1 /* expect random page references */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 294d251..ecc3ae1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -31,6 +31,9 @@ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#define MCL_ONFAULT 4 /* lock all pages that are faulted in */ + +#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ #define MADV_NORMAL 0 /* no further special treatment */ #define MADV_RANDOM 1 /* expect random page references */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 6ea26df..03c06ba 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -22,6 +22,7 @@ #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ +#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index 0b14df3..9765896 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h @@ -17,6 +17,7 @@ #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ +#define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ #define MAP_POPULATE 0x8000 /* populate 
(prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ diff --git a/arch/tile/include/uapi/asm/mman.h b/arch/tile/include/uapi/asm/mman.h index 81b8fc3..63ee13f 100644 --- a/arch/tile/include/uapi/asm/mman.h +++ b/arch/tile/include/uapi/asm/mman.h @@ -36,6 +36,7 @@ */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#define MCL_ONFAULT 4 /* lock all pages that are faulted in */ #endif /* _ASM_TILE_MMAN_H */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 201aec0..360944e 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -74,6 +74,12 @@ */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#define MCL_ONFAULT 4 /* lock all pages that are faulted in */ + +/* + * Flags for mlock + */ +#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ #define MADV_NORMAL 0 /* no further special treatment */ #define MADV_RANDOM 1 /* expect random page references */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index ddc3b36..a74dd84 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -25,6 +25,11 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +/* + * Flags for mlock + */ +#define MLOCK_ONFAULT 0x01 /* Lock pages in range after they are faulted in, do not prefault */ + #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ #define MS_SYNC 4 /* synchronous memory sync */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index e9fe6fd..7162cd4 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -17,5 +17,6 @@ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ +#define MCL_ONFAULT 4 /* lock all pages that are faulted in */ #endif /* __ASM_GENERIC_MMAN_H */ diff --git a/mm/mlock.c b/mm/mlock.c index ca38941..339d9e0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -506,7 +506,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - goto out; /* don't set VM_LOCKED, don't count */ + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, @@ -577,7 +578,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & ~VM_LOCKED; + vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= flags; @@ -646,10 +647,15 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { - if (flags) + vm_flags_t vm_flags = VM_LOCKED; + + if (flags & ~MLOCK_ONFAULT) return -EINVAL; - return do_mlock(start, len, VM_LOCKED); + if (flags & MLOCK_ONFAULT) + vm_flags |= VM_LOCKONFAULT; + + return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) @@ -666,24 +672,43 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) return ret; } +/* + * Take the MCL_* flags passed into 
mlockall (or 0 if called from munlockall) + * and translate into the appropriate modifications to mm->def_flags and/or the + * flags for all current VMAs. + * + * There are a couple of subtleties with this. If mlockall() is called multiple + * times with different flags, the values do not necessarily stack. If mlockall + * is called once including the MCL_FUTURE flag and then a second time without + * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. + */ static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; + vm_flags_t to_add = 0; - if (flags & MCL_FUTURE) + current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; - else - current->mm->def_flags &= ~VM_LOCKED; - if (flags == MCL_FUTURE) - goto out; + if (flags & MCL_ONFAULT) + current->mm->def_flags |= VM_LOCKONFAULT; + + if (!(flags & MCL_CURRENT)) + goto out; + } + + if (flags & MCL_CURRENT) { + to_add |= VM_LOCKED; + if (flags & MCL_ONFAULT) + to_add |= VM_LOCKONFAULT; + } for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags & ~VM_LOCKED; - if (flags & MCL_CURRENT) - newflags |= VM_LOCKED; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= to_add; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); @@ -698,7 +723,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) unsigned long lock_limit; int ret; - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) return -EINVAL; if (!can_do_mlock()) -- cgit v0.10.2 From b3b0d09c7a2330759ac293f5269bd932439ea0ff Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 5 Nov 2015 18:51:43 -0800 Subject: selftests: vm: add tests for lock on fault Test the mmap() flag, and the mlockall() flag. These tests ensure that pages are not faulted in until they are accessed, that the pages are unevictable once faulted in, and that VMA splitting and merging works with the new VM flag. The second test ensures that mlock limits are respected. Note that the limit test needs to be run a normal user. Also add tests to use the new mlock2 family of system calls. [treding@nvidia.com: : Fix mlock2-tests for 32-bit architectures] [treding@nvidia.com: ensure the mlock2 syscall number can be found] [treding@nvidia.com: use the right arguments for main()] Signed-off-by: Eric B Munson Cc: Shuah Khan Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Kirill A. 
Shutemov Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Stephen Rothwell Signed-off-by: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 3c53cac..e4bb1de 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -5,6 +5,8 @@ BINARIES = compaction_test BINARIES += hugepage-mmap BINARIES += hugepage-shm BINARIES += map_hugetlb +BINARIES += mlock2-tests +BINARIES += on-fault-limit BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c new file mode 100644 index 0000000..4431994 --- /dev/null +++ b/tools/testing/selftests/vm/mlock2-tests.c @@ -0,0 +1,736 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MLOCK_ONFAULT +#define MLOCK_ONFAULT 1 +#endif + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int mlock2_(void *start, size_t len, int flags) +{ +#ifdef __NR_mlock2 + return syscall(__NR_mlock2, start, len, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + +struct vm_boundaries { + unsigned long start; + unsigned long end; +}; + +static int get_vm_area(unsigned long addr, struct vm_boundaries *area) +{ + FILE *file; + int ret = 1; + char line[1024] = {0}; + char *end_addr; + char *stop; + unsigned long start; + unsigned long end; + + if (!area) + return ret; + + file = fopen("/proc/self/maps", "r"); + if (!file) { + perror("fopen"); + return ret; + } + + memset(area, 0, sizeof(struct vm_boundaries)); + + while(fgets(line, 1024, file)) { + end_addr = strchr(line, '-'); + if (!end_addr) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + *end_addr = '\0'; + end_addr++; + stop = strchr(end_addr, ' '); + if (!stop) { + printf("cannot parse /proc/self/maps\n"); + goto out; + } + stop = '\0'; + + sscanf(line, "%lx", &start); + sscanf(end_addr, "%lx", &end); + + if (start <= addr && end > addr) { + area->start = start; + area->end = end; + ret = 0; + goto out; + } + } +out: + fclose(file); + return ret; +} + +static uint64_t get_pageflags(unsigned long addr) +{ + FILE *file; + uint64_t pfn; + unsigned long offset; + + file = fopen("/proc/self/pagemap", "r"); + if (!file) { + perror("fopen pagemap"); + _exit(1); + } + + offset = addr / getpagesize() * sizeof(pfn); + + if (fseek(file, offset, SEEK_SET)) { + perror("fseek pagemap"); + _exit(1); + } + + if (fread(&pfn, sizeof(pfn), 1, file) != 1) { + perror("fread pagemap"); + _exit(1); + } + + fclose(file); + return pfn; +} + +static uint64_t get_kpageflags(unsigned long pfn) +{ + uint64_t flags; + FILE *file; + + file = fopen("/proc/kpageflags", "r"); + if (!file) { + perror("fopen kpageflags"); + _exit(1); + } + + if (fseek(file, pfn * sizeof(flags), SEEK_SET)) { + perror("fseek kpageflags"); + _exit(1); + } + + if (fread(&flags, sizeof(flags), 1, file) != 1) { + perror("fread kpageflags"); + _exit(1); + } + + fclose(file); + return flags; +} + +static FILE *seek_to_smaps_entry(unsigned long addr) +{ + FILE *file; + char *line = NULL; + size_t size = 0; + unsigned long start, end; + char perms[5]; + unsigned long offset; + char dev[32]; + unsigned long inode; + char path[BUFSIZ]; + + file = fopen("/proc/self/smaps", "r"); + if (!file) { + perror("fopen smaps"); + _exit(1); + } + + while (getline(&line, &size, file) > 0) { + if (sscanf(line, "%lx-%lx %s %lx %s %lu 
%s\n", + &start, &end, perms, &offset, dev, &inode, path) < 6) + goto next; + + if (start <= addr && addr < end) + goto out; + +next: + free(line); + line = NULL; + size = 0; + } + + fclose(file); + file = NULL; + +out: + free(line); + return file; +} + +#define VMFLAGS "VmFlags:" + +static bool is_vmflag_set(unsigned long addr, const char *vmflag) +{ + char *line = NULL; + char *flags; + size_t size = 0; + bool ret = false; + FILE *smaps; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, VMFLAGS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + flags = line + strlen(VMFLAGS); + ret = (strstr(flags, vmflag) != NULL); + goto out; + } + +out: + free(line); + fclose(smaps); + return ret; +} + +#define SIZE "Size:" +#define RSS "Rss:" +#define LOCKED "lo" + +static bool is_vma_lock_on_fault(unsigned long addr) +{ + bool ret = false; + bool locked; + FILE *smaps = NULL; + unsigned long vma_size, vma_rss; + char *line = NULL; + char *value; + size_t size = 0; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + goto out; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, SIZE)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value = line + strlen(SIZE); + if (sscanf(value, "%lu kB", &vma_size) < 1) { + printf("Unable to parse smaps entry for Size\n"); + goto out; + } + break; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, RSS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value = line + strlen(RSS); + if (sscanf(value, "%lu kB", &vma_rss) < 1) { + printf("Unable to parse smaps entry for Rss\n"); + goto out; + } + break; + } + + ret = locked && (vma_rss < vma_size); +out: + free(line); + if (smaps) + fclose(smaps); + return ret; +} + +#define PRESENT_BIT 0x8000000000000000 +#define PFN_MASK 0x007FFFFFFFFFFFFF +#define UNEVICTABLE_BIT (1UL << 18) + +static int lock_check(char *map) +{ + unsigned long page_size = getpagesize(); + uint64_t page1_flags, page2_flags; + + page1_flags = get_pageflags((unsigned long)map); + page2_flags = get_pageflags((unsigned long)map + page_size); + + /* Both pages should be present */ + if (((page1_flags & PRESENT_BIT) == 0) || + ((page2_flags & PRESENT_BIT) == 0)) { + printf("Failed to make both pages present\n"); + return 1; + } + + page1_flags = get_kpageflags(page1_flags & PFN_MASK); + page2_flags = get_kpageflags(page2_flags & PFN_MASK); + + /* Both pages should be unevictable */ + if (((page1_flags & UNEVICTABLE_BIT) == 0) || + ((page2_flags & UNEVICTABLE_BIT) == 0)) { + printf("Failed to make both pages unevictable\n"); + return 1; + } + + if (!is_vmflag_set((unsigned long)map, LOCKED)) { + printf("VMA flag %s is missing on page 1\n", LOCKED); + return 1; + } + + if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) { + printf("VMA flag %s is missing on page 2\n", LOCKED); + return 1; + } + + return 0; +} + +static int unlock_lock_check(char *map) +{ + unsigned long page_size = getpagesize(); + uint64_t page1_flags, page2_flags; + + page1_flags = get_pageflags((unsigned long)map); + page2_flags = get_pageflags((unsigned long)map + page_size); + page1_flags = get_kpageflags(page1_flags & PFN_MASK); + page2_flags = get_kpageflags(page2_flags & PFN_MASK); + + if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & 
UNEVICTABLE_BIT)) { + printf("A page is still marked unevictable after unlock\n"); + return 1; + } + + if (is_vmflag_set((unsigned long)map, LOCKED)) { + printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + return 1; + } + + if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) { + printf("VMA flag %s is present on page 2 after unlock\n", LOCKED); + return 1; + } + + return 0; +} + +static int test_mlock_lock() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, 0)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(0); + } + perror("mlock2(0)"); + goto unmap; + } + + if (lock_check(map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + perror("munlock()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + uint64_t page1_flags, page2_flags; + + page1_flags = get_pageflags((unsigned long)map); + page2_flags = get_pageflags((unsigned long)map + page_size); + + /* Neither page should be present */ + if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) { + printf("Pages were made present by MLOCK_ONFAULT\n"); + return 1; + } + + *map = 'a'; + page1_flags = get_pageflags((unsigned long)map); + page2_flags = get_pageflags((unsigned long)map + page_size); + + /* Only page 1 should be present */ + if ((page1_flags & PRESENT_BIT) == 0) { + printf("Page 1 is not present after fault\n"); + return 1; + } else if (page2_flags & PRESENT_BIT) { + printf("Page 2 was made present\n"); + return 1; + } + + page1_flags = get_kpageflags(page1_flags & PFN_MASK); + + /* Page 1 should be unevictable */ + if ((page1_flags & UNEVICTABLE_BIT) == 0) { + printf("Failed to make faulted page unevictable\n"); + return 1; + } + + if (!is_vma_lock_on_fault((unsigned long)map)) { + printf("VMA is not marked for lock on fault\n"); + return 1; + } + + if (!is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA is not marked for lock on fault\n"); + return 1; + } + + return 0; +} + +static int unlock_onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + uint64_t page1_flags; + + page1_flags = get_pageflags((unsigned long)map); + page1_flags = get_kpageflags(page1_flags & PFN_MASK); + + if (page1_flags & UNEVICTABLE_BIT) { + printf("Page 1 is still marked unevictable after unlock\n"); + return 1; + } + + if (is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA is still lock on fault after unlock\n"); + return 1; + } + + return 0; +} + +static int test_mlock_onfault() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(0); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * 
page_size)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(0); + } + perror("munlock()"); + goto unmap; + } + + ret = unlock_onfault_check(map); +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_lock_onfault_of_present() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + uint64_t page1_flags, page2_flags; + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + *map = 'a'; + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(0); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + page1_flags = get_pageflags((unsigned long)map); + page2_flags = get_pageflags((unsigned long)map + page_size); + page1_flags = get_kpageflags(page1_flags & PFN_MASK); + page2_flags = get_kpageflags(page2_flags & PFN_MASK); + + /* Page 1 should be unevictable */ + if ((page1_flags & UNEVICTABLE_BIT) == 0) { + printf("Failed to make present page unevictable\n"); + goto unmap; + } + + if (!is_vma_lock_on_fault((unsigned long)map) || + !is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA with present pages is not marked lock on fault\n"); + goto unmap; + } + ret = 0; +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_munlockall() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT)) { + perror("mlockall(MCL_CURRENT)"); + goto out; + } + + if (lock_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_lock_check(map)) + goto unmap; + + munmap(map, 2 * page_size); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall second mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { + perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_onfault_check(map)) + goto unmap; + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); + goto out; + } + + if (lock_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + munlockall(); + return ret; +} + +static int test_vma_management(bool call_mlock) +{ + int ret = 1; + void *map; + unsigned long page_size = getpagesize(); + struct vm_boundaries page1; + struct vm_boundaries page2; + struct vm_boundaries page3; + + map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (map == MAP_FAILED) { + perror("mmap()"); + return ret; + } + + if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(0); + } + perror("mlock(ONFAULT)\n"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, 
&page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* + * Before we unlock a portion, we need to that all three pages are in + * the same VMA. If they are not we abort this test (Note that this is + * not a failure) + */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("VMAs are not merged to start, aborting test\n"); + ret = 0; + goto out; + } + + if (munlock(map + page_size, page_size)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* All three VMAs should be different */ + if (page1.start == page2.start || page2.start == page3.start) { + printf("failed to split VMA for munlock\n"); + goto out; + } + + /* Now unlock the first and third page and check the VMAs again */ + if (munlock(map, page_size * 3)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* Now all three VMAs should be the same */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("failed to merge VMAs after munlock\n"); + goto out; + } + + ret = 0; +out: + munmap(map, 3 * page_size); + return ret; +} + +static int test_mlockall(int (test_function)(bool call_mlock)) +{ + int ret = 1; + + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + ret = test_function(false); + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + ret += test_mlock_lock(); + ret += test_mlock_onfault(); + ret += test_munlockall(); + ret += test_lock_onfault_of_present(); + ret += test_vma_management(true); + ret += test_mlockall(test_vma_management); + return ret; +} diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c new file mode 100644 index 0000000..245accc --- /dev/null +++ b/tools/testing/selftests/vm/on-fault-limit.c @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include +#include + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int test_limit(void) +{ + int ret = 1; + struct rlimit lims; + void *map; + + if (getrlimit(RLIMIT_MEMLOCK, &lims)) { + perror("getrlimit"); + return ret; + } + + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0); + if (map != MAP_FAILED) + printf("mmap should have failed, but didn't\n"); + else { + ret = 0; + munmap(map, 2 * lims.rlim_max); + } + + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + ret += test_limit(); + return ret; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 9179ce8..2df21b3 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -106,4 +106,26 @@ else echo "[PASS]" fi +echo "--------------------" +echo "running on-fault-limit" +echo "--------------------" +sudo -u nobody ./on-fault-limit +if [ $? 
-ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + +echo "--------------------" +echo "running mlock2-tests" +echo "--------------------" +./mlock2-tests +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + exit $exitcode -- cgit v0.10.2