From 35e481761cdc688dbee0ef552a13f49af8eba6cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2016 17:08:59 -0700 Subject: fsnotify: avoid spurious EMFILE errors from inotify_init() An inotify instance is destroyed when all references to it are dropped. That not only means that the corresponding file descriptor needs to be closed but also that all corresponding instance marks are freed (as each mark holds a reference to the inotify instance). However, marks are freed only after the SRCU period ends, which can take some time; thus, if a user rapidly creates and frees inotify instances, the number of existing inotify instances can exceed the max_user_instances limit even though, from the user's point of view, there is always at most one existing instance. Thus inotify_init() returns an EMFILE error, which is hard to justify from the user's point of view. This problem is exposed by the LTP inotify06 testcase on some machines. We fix the problem by making sure all group marks are properly freed while destroying the inotify instance. We wait for the SRCU period to end in that path anyway, since we have to make sure no event is being added to the instance while we are tearing it down. So it takes only some plumbing to allow marks to be destroyed in that path as well, rather than from a dedicated work item. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jan Kara Reported-by: Xiaoguang Wang Tested-by: Xiaoguang Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index b44c68a..0a3bc2c 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, &mnt->mnt_root->d_lock); } +/* prepare for freeing all marks associated with given group */ +extern void fsnotify_detach_group_marks(struct fsnotify_group *group); +/* + * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list + */ +extern void fsnotify_mark_destroy_list(void); + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/group.c b/fs/notify/group.c index d16b62c..3e2dd85 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { - /* clear all inode marks for this group */ - fsnotify_clear_marks_by_group(group); + /* clear all inode marks for this group, attach them to destroy_list */ + fsnotify_detach_group_marks(group); - synchronize_srcu(&fsnotify_mark_srcu); + /* + * Wait for fsnotify_mark_srcu period to end and free all marks in + * destroy_list + */ + fsnotify_mark_destroy_list(); - /* clear the notification queue of all events */ + /* + * Since we have waited for fsnotify_mark_srcu in + * fsnotify_mark_destroy_list() there can be no outstanding event + * notification against this group. So clearing the notification queue + * of all events is reliable now.
+ */ fsnotify_flush_notify(group); /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 7115c5d..d3fea0b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); -static void fsnotify_mark_destroy(struct work_struct *work); -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); +static void fsnotify_mark_destroy_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) } /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Prepare mark for freeing and add it to the list of marks prepared for + * freeing. The actual freeing must happen after SRCU period ends and the + * caller is responsible for this. + * + * The function returns true if the mark was added to the list of marks for + * freeing. The function returns false if someone else has already called + * __fsnotify_free_mark() for the mark. */ -void fsnotify_free_mark(struct fsnotify_mark *mark) +static bool __fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return; + return false; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + + return true; +} + +/* + * Free fsnotify mark. The freeing is actually happening from a workqueue which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + if (__fsnotify_free_mark(mark)) { + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } /* - * Given a group, destroy all of the marks associated with that group. + * Given a group, prepare for freeing all the marks associated with that group. + * The marks are attached to the list of marks prepared for destruction, the + * caller is responsible for freeing marks in that list after SRCU period has + * ended. 
*/ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +void fsnotify_detach_group_marks(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); + struct fsnotify_mark *mark; + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&group->marks_list)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&group->marks_list, + struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + __fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->free_mark = free_mark; } -static void fsnotify_mark_destroy(struct work_struct *work) +/* + * Destroy all marks in destroy_list, waits for SRCU period to finish before + * actually freeing marks. + */ +void fsnotify_mark_destroy_list(void) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work) fsnotify_put_mark(mark); } } + +static void fsnotify_mark_destroy_workfn(struct work_struct *work) +{ + fsnotify_mark_destroy_list(); +} diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1259e53..29f9175 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); -/* run all the marks in a group, and flag them to be freed */ -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); -- cgit v0.10.2 From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:02 -0700 Subject: time: add missing implementation for timespec64_add_safe() timespec64_add_safe() has been defined in time64.h for 64 bit systems. But, 32 bit systems only have an extern function prototype defined. Provide a definition for the above function. The function will be necessary as part of y2038 changes. struct timespec is not y2038 safe. All references to timespec will be replaced by struct timespec64. The function is meant to be a replacement for timespec_add_safe(). The implementation is similar to timespec_add_safe(). Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/time64.h b/include/linux/time64.h index 367d5af..1778937 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n /* * timespec64_add_safe assumes both values are positive and checks for - * overflow. 
It will return TIME_T_MAX if the returned value would be - smaller then either of the arguments. + overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { diff --git a/kernel/time/time.c b/kernel/time/time.c index a4064b6..cb1f83e 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +#if __BITS_PER_LONG != 64 + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +#endif -- cgit v0.10.2 From 766b9f928bd5b9b185d986d40355d1f143484136 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:05 -0700 Subject: fs: poll/select/recvmmsg: use timespec64 for timeout events struct timespec is not y2038 safe. Even though timespec might be sufficient to represent timeouts, use struct timespec64 here as the plan is to get rid of all timespec references in the kernel. The patch transitions the common functions poll_select_set_timeout() and select_estimate_accuracy() to use timespec64, and all the syscalls that use these functions are transitioned in the same patch. The restart block parameters for poll use monotonic time. Use timespec64 here as well to assign the timeout value. This parameter in the restart block need not change, because it only holds the monotonic timestamp at which the timeout should occur, and the unsigned long data type should be big enough for this timestamp. The system call interfaces will be handled in a separate series. Compat interfaces need not change, as timespec64 is an alias of struct timespec on a 64-bit system. Link: http://lkml.kernel.org/r/1461947989-21926-3-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Acked-by: David S.
Miller Cc: Alexander Viro Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a74a2a..10db912 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } -static inline struct timespec ep_set_mstimeout(long ms) +static inline struct timespec64 ep_set_mstimeout(long ms) { - struct timespec now, ts = { + struct timespec64 now, ts = { .tv_sec = ms / MSEC_PER_SEC, .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), }; - ktime_get_ts(&now); - return timespec_add_safe(now, ts); + ktime_get_ts64(&now); + return timespec64_add_safe(now, ts); } /** @@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ktime_t expires, *to = NULL; if (timeout > 0) { - struct timespec end_time = ep_set_mstimeout(timeout); + struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; - *to = timespec_to_ktime(end_time); + *to = timespec64_to_ktime(end_time); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/select.c b/fs/select.c index 8692939..8ed9da5 100644 --- a/fs/select.c +++ b/fs/select.c @@ -47,7 +47,7 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec *tv) +static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; @@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -u64 select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; - struct timespec now; + struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. @@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv) if (rt_task(current)) return 0; - ktime_get_ts(&now); - now = timespec_sub(*tv, now); + ktime_get_ts64(&now); + now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; @@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value - * @to: pointer to timespec variable for the final timeout + * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * @@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout); * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. 
*/ -int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { - struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { - ktime_get_ts(to); - *to = timespec_add_safe(*to, ts); + ktime_get_ts64(to); + *to = timespec64_add_safe(*to, ts); } return 0; } -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, +static int poll_select_copy_remaining(struct timespec64 *end_time, + void __user *p, int timeval, int ret) { + struct timespec64 rts64; struct timespec rts; struct timeval rtv; @@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&rts); - rts = timespec_sub(*end_time, rts); - if (rts.tv_sec < 0) - rts.tv_sec = rts.tv_nsec = 0; + ktime_get_ts64(&rts64); + rts64 = timespec64_sub(*end_time, rts64); + if (rts64.tv_sec < 0) + rts64.tv_sec = rts64.tv_nsec = 0; + + rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts64.tv_sec; + rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; @@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time) + fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -622,7 +626,7 @@ out_nofds: SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; @@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 ts64, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; + ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } @@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec *end_time) + struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec *end_time) + struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec *to = NULL, end_time; + struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { @@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { @@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 end_time, *to = NULL; int ret; if (tsp) { diff --git a/include/linux/poll.h b/include/linux/poll.h index 9fb4f40..37b057b 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern u64 select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec64 *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) @@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset) #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time); +extern int do_select(int n, 
fd_set_bits *fds, struct timespec64 *end_time); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, - struct timespec *end_time); + struct timespec64 *end_time); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time); + fd_set __user *exp, struct timespec64 *end_time); -extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec); +extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec, + long nsec); #endif /* _LINUX_POLL_H */ diff --git a/net/socket.c b/net/socket.c index e7793f5..a1bd161 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2168,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; - struct timespec end_time; + struct timespec64 end_time; + struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, @@ -2220,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, flags |= MSG_DONTWAIT; if (timeout) { - ktime_get_ts(timeout); - *timeout = timespec_sub(end_time, *timeout); + ktime_get_ts64(&timeout64); + *timeout = timespec64_to_timespec( + timespec64_sub(end_time, timeout64)); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; -- cgit v0.10.2 From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:08 -0700 Subject: time: remove timespec_add_safe() All references to timespec_add_safe() now use timespec64_add_safe(). The plan is to replace struct timespec references with struct timespec64 throughout the kernel as timespec is not y2038 safe. Drop timespec_add_safe() and use timespec64_add_safe() for all architectures. Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/time64.h b/include/linux/time64.h index 1778937..7e5d2fa 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec * # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add_safe timespec_add_safe # define timespec64_add timespec_add # define timespec64_sub timespec_sub # define timespec64_valid timespec_valid @@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); -/* - * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME64_MAX in case of overflow. - */ -extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, - const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { @@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) #endif +/* + * timespec64_add_safe assumes both values are positive and checks for + * overflow. It will return TIME64_MAX in case of overflow. 
+ */ +extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs); + #endif /* _LINUX_TIME64_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index cb1f83e..667b933 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } -#if __BITS_PER_LONG != 64 - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). @@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } - -#endif -- cgit v0.10.2 From 310c6dd06a0b3c6e808067379cfb56c864e95ffc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 May 2016 17:09:11 -0700 Subject: scripts/decode_stacktrace.sh: handle symbols in modules scripts/decode_stacktrace.sh presently displays module symbols as func+0x0ff/0x5153 [module] Add a third argument: the pathname of a directory where the script should look for the file module.ko so that the output appears as func (foo/bar.c:123) module Without the argument or if the module file isn't found the script prints such symbols as is without decoding. Signed-off-by: Konstantin Khlebnikov Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 00d6d53c..c332684 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -2,15 +2,17 @@ # (c) 2014, Sasha Levin #set -x -if [[ $# != 2 ]]; then +if [[ $# < 2 ]]; then echo "Usage:" - echo " $0 [vmlinux] [base path]" + echo " $0 [vmlinux] [base path] [modules path]" exit 1 fi vmlinux=$1 basepath=$2 +modpath=$3 declare -A cache +declare -A modcache parse_symbol() { # The structure of symbol at this point is: @@ -19,6 +21,17 @@ parse_symbol() { # For example: # do_basic_setup+0x9c/0xbf + if [[ $module == "" ]] ; then + local objfile=$vmlinux + elif [[ "${modcache[$module]+isset}" == "isset" ]]; then + local objfile=${modcache[$module]} + else + [[ $modpath == "" ]] && return + local objfile=$(find "$modpath" -name $module.ko -print -quit) + [[ $objfile == "" ]] && return + modcache[$module]=$objfile + fi + # Remove the englobing parenthesis symbol=${symbol#\(} symbol=${symbol%\)} @@ -29,11 +42,11 @@ parse_symbol() { # Use 'nm vmlinux' to figure out the base address of said symbol. # It's actually faster to call it every time than to load it # all into bash. - if [[ "${cache[$name]+isset}" == "isset" ]]; then - local base_addr=${cache[$name]} + if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then + local base_addr=${cache[$module,$name]} else - local base_addr=$(nm "$vmlinux" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1) - cache["$name"]="$base_addr" + local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1) + cache[$module,$name]="$base_addr" fi # Let's start doing the math to get the exact address into the # symbol. First, strip out the symbol total length. 
@@ -48,12 +61,12 @@ parse_symbol() { local address=$(printf "%x\n" "$expr") # Pass it to addr2line to get filename and line number - # Could get more than one result - if [[ "${cache[$address]+isset}" == "isset" ]]; then - local code=${cache[$address]} + # Could get more than one result + if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then + local code=${cache[$module,$address]} else - local code=$(addr2line -i -e "$vmlinux" "$address") - cache[$address]=$code + local code=$(addr2line -i -e "$objfile" "$address") + cache[$module,$address]=$code fi # addr2line doesn't return a proper error code if it fails, so @@ -105,13 +118,23 @@ handle_line() { fi done - # The symbol is the last element, process it - symbol=${words[$last]} + if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then + module=${words[$last]} + module=${module#\[} + module=${module%\]} + symbol=${words[$last-1]} + unset words[$last-1] + else + # The symbol is the last element, process it + symbol=${words[$last]} + module= + fi + unset words[$last] parse_symbol # modifies $symbol # Add up the line number to the symbol - echo "${words[@]}" "$symbol" + echo "${words[@]}" "$symbol $module" } while read line; do @@ -121,8 +144,8 @@ while read line; do handle_line "$line" # Is it a code line? elif [[ $line == *Code:* ]]; then - decode_code "$line" - else + decode_code "$line" + else # Nothing special in this line, show it as is echo "$line" fi -- cgit v0.10.2 From bad7de742d8192e9759e7f462bd2055a7e7d71f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 May 2016 17:09:14 -0700 Subject: scripts/spelling.txt: add "fimware" misspelling A few instances of "fimware" instead of "firmware" were found. Fix these and add it to the spelling.txt file. Signed-off-by: Kees Cook Reported-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/media/usb/dvb-usb/dib0700_core.c b/drivers/media/usb/dvb-usb/dib0700_core.c index c16f999..bf890c3 100644 --- a/drivers/media/usb/dvb-usb/dib0700_core.c +++ b/drivers/media/usb/dvb-usb/dib0700_core.c @@ -517,7 +517,7 @@ int dib0700_download_firmware(struct usb_device *udev, const struct firmware *fw if (nb_packet_buffer_size < 1) nb_packet_buffer_size = 1; - /* get the fimware version */ + /* get the firmware version */ usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), REQUEST_GET_VERSION, USB_TYPE_VENDOR | USB_DIR_IN, 0, 0, diff --git a/drivers/scsi/bfa/bfi.h b/drivers/scsi/bfa/bfi.h index 97600dc..5f698d0 100644 --- a/drivers/scsi/bfa/bfi.h +++ b/drivers/scsi/bfa/bfi.h @@ -356,7 +356,7 @@ struct bfi_ioc_image_hdr_s { u8 port0_mode; /* device mode for port 0 */ u8 port1_mode; /* device mode for port 1 */ u32 exec; /* exec vector */ - u32 bootenv; /* fimware boot env */ + u32 bootenv; /* firmware boot env */ u32 rsvd_b[2]; struct bfi_ioc_fwver_s fwver; u32 md5sum[BFI_IOC_MD5SUM_SZ]; diff --git a/drivers/staging/comedi/drivers/daqboard2000.c b/drivers/staging/comedi/drivers/daqboard2000.c index 57ab668..a536a15 100644 --- a/drivers/staging/comedi/drivers/daqboard2000.c +++ b/drivers/staging/comedi/drivers/daqboard2000.c @@ -26,7 +26,7 @@ * Much of the functionality of this driver was determined from reading * the source code for the Windows driver. * - * The FPGA on the board requires fimware, which is available from + * The FPGA on the board requires firmware, which is available from * http://www.comedi.org in the comedi_nonfree_firmware tarball. 
* * Configuration options: not applicable, uses PCI auto config diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 946caf3..fa79c6d 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -428,6 +428,7 @@ feautures||features fetaure||feature fetaures||features fileystem||filesystem +fimware||firmware finanize||finalize findn||find finilizes||finalizes -- cgit v0.10.2 From b21e91c305bcebf55b7a34638e5885528f3fb453 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 19 May 2016 17:09:17 -0700 Subject: scripts/bloat-o-meter: print percent change This adds an additional line of output (to reduce the chances of breaking any existing output parsers) which prints the total size before and after and the relative difference. add/remove: 39/0 grow/shrink: 12408/55 up/down: 362227/-1430 (360797) function old new delta ext4_fill_super 10556 12590 +2034 _fpadd_parts - 1186 +1186 ntfs_fill_super 5340 6164 +824 ... ... __divdf3 752 386 -366 unlzma 3682 3274 -408 Total: Before=5023101, After=5383898, chg 7.000000% ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Link: http://lkml.kernel.org/r/1463124110-30314-1-git-send-email-vgupta@synopsys.com Signed-off-by: Vineet Gupta Cc: Josh Triplett Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 38b64f4..0254f3ba 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -32,18 +32,21 @@ old = getsizes(sys.argv[1]) new = getsizes(sys.argv[2]) grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 delta, common = [], {} +otot, ntot = 0, 0 for a in old: if a in new: common[a] = 1 for name in old: + otot += old[name] if name not in common: remove += 1 down += old[name] delta.append((-old[name], name)) for name in new: + ntot += new[name] if name not in common: add += 1 up += new[name] @@ -63,3 +66,6 @@ print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) for d, n in delta: if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) + +print("Total: Before=%d, After=%d, chg %f%%" % \ + (otot, ntot, (ntot - otot)*100/otot)) -- cgit v0.10.2 From b1e4d9d82df8ab9097f80aa208c40eab6fc29858 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:20 -0700 Subject: debugobjects: make fixup functions return bool instead of int I am going to introduce the debugobjects infrastructure to the USB subsystem, but before doing that, I found that the debugobjects code itself could be improved. This patchset makes the fixup functions return bool instead of int, because a fixup only needs to report success or failure; boolean is the 'real' type. This patch (of 7): The object debugging infrastructure core provides some fixup callbacks for the subsystems that use it. These callbacks are called from the debug code whenever a problem in debug_object_init is detected. The debugobjects core expects them to return 1 when the fixup was successful, otherwise 0, so the return type is effectively boolean. A bad thing is that debug_object_fixup() uses the return value in an arithmetic operation, which left me confused about what the real return type is. Reading over the whole code, I found some places that use the return value incorrectly (see the next patch). So why not use bool instead?
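[Editor's note: a minimal sketch of a subsystem fixup callback under the new bool convention described above. It is illustrative only — "my_obj", my_obj_stop() and my_obj_debug_descr are hypothetical stand-ins, not part of this series; the real conversions follow in the diffs below.]

	#include <linux/debugobjects.h>

	/* Hypothetical subsystem object, stop helper and descriptor. */
	static bool my_obj_fixup_init(void *addr, enum debug_obj_state state)
	{
		struct my_obj *obj = addr;

		switch (state) {
		case ODEBUG_STATE_ACTIVE:
			/* Object still active: stop it, then redo the init. */
			my_obj_stop(obj);
			debug_object_init(obj, &my_obj_debug_descr);
			return true;	/* fixup succeeded */
		default:
			return false;	/* nothing we could fix */
		}
	}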
Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 98ffcbd..a899f10 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -39,7 +39,8 @@ struct debug_obj { * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object * @fixup_init: fixup function, which is called when the init check - * fails + * fails. All fixup functions must return true if fixup + * was successful, otherwise return false * @fixup_activate: fixup function, which is called when the activate check * fails * @fixup_destroy: fixup function, which is called when the destroy check @@ -51,12 +52,12 @@ struct debug_obj { */ struct debug_obj_descr { const char *name; - void *(*debug_hint) (void *addr); - int (*fixup_init) (void *addr, enum debug_obj_state state); - int (*fixup_activate) (void *addr, enum debug_obj_state state); - int (*fixup_destroy) (void *addr, enum debug_obj_state state); - int (*fixup_free) (void *addr, enum debug_obj_state state); - int (*fixup_assert_init)(void *addr, enum debug_obj_state state); + void *(*debug_hint)(void *addr); + bool (*fixup_init)(void *addr, enum debug_obj_state state); + bool (*fixup_activate)(void *addr, enum debug_obj_state state); + bool (*fixup_destroy)(void *addr, enum debug_obj_state state); + bool (*fixup_free)(void *addr, enum debug_obj_state state); + bool (*fixup_assert_init)(void *addr, enum debug_obj_state state); }; #ifdef CONFIG_DEBUG_OBJECTS diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 519b5a1..a9cee16 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. 
*/ -static int -debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), +static bool +debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + if (fixup && fixup(addr, state)) { + debug_objects_fixups++; + return true; + } + return false; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -797,7 +796,7 @@ static __initdata struct debug_obj_descr descr_type_test; * fixup_init is called when: * - an active object is initialized */ -static int __init fixup_init(void *addr, enum debug_obj_state state) +static bool __init fixup_init(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -805,9 +804,9 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_init(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -816,7 +815,7 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int __init fixup_activate(void *addr, enum debug_obj_state state) +static bool __init fixup_activate(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -825,17 +824,17 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) if (obj->static_init == 1) { debug_object_init(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 0; + return false; } - return 1; + return true; case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -843,7 +842,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) * fixup_destroy is called when: * - an active object is destroyed */ -static int __init fixup_destroy(void *addr, enum debug_obj_state state) +static bool __init fixup_destroy(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -851,9 +850,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_destroy(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -861,7 +860,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int __init fixup_free(void *addr, enum debug_obj_state state) +static bool __init fixup_free(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -869,9 +868,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_free(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } -- cgit v0.10.2 From e7a8e78bd4ad931660743bd2dbabd9170a715294 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:23 -0700 Subject: debugobjects: correct the usage of fixup call results debug_object_fixup() returns non-zero when the problem has been fixed. But the code got it backwards: it takes 0 as a successful fixup. So fix it.
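[Editor's note: a minimal sketch of the mapping this one-line fix establishes, mirroring the ret handling in debug_object_activate() in the diff below — the fixup result is a success flag, so true maps to 0 and false to -EINVAL.]

	#include <linux/errno.h>
	#include <linux/types.h>

	/* Map a boolean fixup result onto the errno the caller must return. */
	static inline int fixup_result_to_errno(bool fixed)
	{
		return fixed ? 0 : -EINVAL;	/* fixed -> success (0) */
	}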
Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a9cee16..2f07c8c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -415,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); ret = debug_object_fixup(descr->fixup_activate, addr, state); - return ret ? -EINVAL : 0; + return ret ? 0 : -EINVAL; case ODEBUG_STATE_DESTROYED: debug_print_object(obj, "activate"); -- cgit v0.10.2 From 02a982a6ec631d871571f940ca13817551759884 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:26 -0700 Subject: workqueue: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int) Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f5068e..6751b18 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -437,7 +437,7 @@ static void *work_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int work_fixup_init(void *addr, enum debug_obj_state state) +static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -445,9 +445,9 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -456,7 +456,7 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) +static bool work_fixup_activate(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -471,16 +471,16 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); - return 0; + return false; } WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -488,7 +488,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int work_fixup_free(void *addr, enum debug_obj_state state) +static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -496,9 +496,9 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } -- cgit v0.10.2 From e3252464da222ef2c2ca52dff383a824080ea3d5 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:29 -0700 Subject: timer: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to 
change (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa0b983..f962a58 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -342,9 +342,9 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -353,19 +353,19 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_NOTAVAILABLE: WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -373,7 +373,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -381,9 +381,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 73164c3..be33481 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -493,7 +493,7 @@ static void *timer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -501,9 +501,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -518,7 +518,7 @@ static void stub_timer(unsigned long data) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -534,18 +534,18 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1);
default: - return 0; + return false; } } @@ -553,7 +553,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) +static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -561,9 +561,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -571,7 +571,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -584,13 +584,13 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) * is tracked in the object tracker. */ debug_object_init(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } default: - return 0; + return false; } } -- cgit v0.10.2 From 3263d28eb5b93b3c1b024366df87b1d0c774228f Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:32 -0700 Subject: rcu: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ccdc8e..a9df198 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -386,7 +386,7 @@ void destroy_rcu_head(struct rcu_head *head) * - an unknown object is activated (might be a statically initialized object) * Activation is performed internally by call_rcu(). */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) { struct rcu_head *head = addr; @@ -399,9 +399,9 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) */ debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); - return 0; + return false; default: - return 1; + return true; } } -- cgit v0.10.2 From d99b1d8912654c4bdeb51063d2e934afc2372cc2 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:35 -0700 Subject: percpu_counter: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int).
Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f051d69..72d3611 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock); static struct debug_obj_descr percpu_counter_debug_descr; -static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; @@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); - return 1; + return true; default: - return 0; + return false; } } -- cgit v0.10.2 From 8bad1cd0e1edd124c0f05f925762ef84e6047586 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:38 -0700 Subject: Documentation: update debugobjects doc Update the documentation corresponding to the change (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl index 24979f6..7e4f34f 100644 --- a/Documentation/DocBook/debugobjects.tmpl +++ b/Documentation/DocBook/debugobjects.tmpl @@ -316,8 +316,8 @@ - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the + The function returns true when the fixup was successful, + otherwise false. The return value is used to update the statistics. @@ -341,8 +341,8 @@ - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the + The function returns true when the fixup was successful, + otherwise false. The return value is used to update the statistics. @@ -359,7 +359,8 @@ statically initialized object or not. In case it is it calls debug_object_init() and debug_object_activate() to make the object known to the tracker and marked active. In this case - the function should return 0 because this is not a real fixup. + the function should return false because this is not a real + fixup. @@ -376,8 +377,8 @@ - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the + The function returns true when the fixup was successful, + otherwise false. The return value is used to update the statistics. @@ -397,8 +398,8 @@ - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the + The function returns true when the fixup was successful, + otherwise false. The return value is used to update the statistics. @@ -414,8 +415,8 @@ debug bucket. - The function returns 1 when the fixup was successful, - otherwise 0. The return value is used to update the + The function returns true when the fixup was successful, + otherwise false. The return value is used to update the statistics. @@ -427,7 +428,8 @@ case. The fixup function should check if this is a legitimate case of a statically initialized object or not. In this case only debug_object_init() should be called to make the object known to - the tracker.
Then the function should return 0 because this is not + the tracker. Then the function should return false because this + is not + a real fixup. -- cgit v0.10.2 From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:41 -0700 Subject: debugobjects: insulate non-fixup logic related to static obj from fixup callbacks When activating a static object we need to make sure that the object is tracked in the object tracker. If it is a non-static object, then the activation is illegal. In the previous implementation, each subsystem needed to take care of this in its fixup callbacks. Actually we can put it into the debugobjects core. Thus we can save duplicated code and have *pure* fixup callbacks. To achieve this, a new callback "is_static_object" is introduced to let the type specific code decide whether an object is static or not. If it is, we take it into the object tracker; otherwise we give a warning and invoke the fixup callback. This change has passed the debugobjects selftest, and I also did some testing with all debugobjects support enabled. Finally, I have a concern about the fixups: can a fixup change an object that is in an incorrect state? Because the 'addr' may not point to any valid object if a non-static object is not tracked, changing such an object can overwrite someone else's memory and cause unexpected behaviour. For example, timer_fixup_activate() binds the timer to the function stub_timer(). Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com [changbin.du@intel.com: improve code comments where the new is_static_object callback is invoked] Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index a899f10..46056cb 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,6 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object + * @is_static_object return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. All fixup functions must return true if fixup * was successful, otherwise return false @@ -53,6 +54,7 @@ struct debug_obj { struct debug_obj_descr { const char *name; void *(*debug_hint)(void *addr); + bool (*is_static_object)(void *addr); bool (*fixup_init)(void *addr, enum debug_obj_state state); bool (*fixup_activate)(void *addr, enum debug_obj_state state); bool (*fixup_destroy)(void *addr, enum debug_obj_state state); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a9df198..3e888cd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu().
- */ -static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. - */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return false; - default: - return true; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f962a58..8c7392c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return false; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be33481..3a95f97 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized @@ -516,30 +524,16 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } - return false; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); @@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } + setup_timer(timer, stub_timer, 0); + return true; default: return false; } @@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6751b18..e1c0e99 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } +static bool work_is_static_object(void *addr) +{ + struct work_struct *work = addr; + + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} + /* * fixup_init is called when: * - an active object is initialized @@ -452,39 +459,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state) } /* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static bool work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return false; - } - WARN_ON_ONCE(1); - return false; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return false; - } -} - -/* * fixup_free is called when: * - an active object is freed */ @@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2f07c8c..a8e1260 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 
0 : -EINVAL; } return 0; } @@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -792,6 +805,13 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized @@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool __init fixup_activate(void *addr, enum debug_obj_state state) { @@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return false; - } return true; - case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); @@ -916,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, -- cgit v0.10.2 From 8ba442214c332a2a10ac90bc24ebb00aea4ae0ec Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Thu, 19 May 2016 17:09:44 -0700 Subject: ocfs2: fix comment in struct ocfs2_extended_slot The comment in ocfs2_extended_slot has the offset wrong. Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 540ab5b..44d178b 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -580,7 +580,7 @@ struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; -/*10*/ +/*08*/ }; /* -- cgit v0.10.2 From c14688ea248c5ddfb6b8b97151de0a8620ba4a68 Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 19 May 2016 17:09:47 -0700 Subject: ocfs2: clean up an unused variable 'wants_rotate' in ocfs2_truncate_rec Clean up an unused variable 'wants_rotate' in ocfs2_truncate_rec. 
Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e361d1a..460c0ce 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle, { int ret; u32 left_cpos, rec_range, trunc_range; - int wants_rotate = 0, is_rightmost_tree_rec = 0; + int is_rightmost_tree_rec = 0; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle, memset(rec, 0, sizeof(*rec)); ocfs2_cleanup_merge(el, index); - wants_rotate = 1; next_free = le16_to_cpu(el->l_next_free_rec); if (is_rightmost_tree_rec && next_free > 1) { -- cgit v0.10.2 From aa6913dbd26af386e7703f6c0d4ed2a848b7a052 Mon Sep 17 00:00:00 2001 From: Jun Piao Date: Thu, 19 May 2016 17:09:50 -0700 Subject: ocfs2: clean up unused parameter 'count' in o2hb_read_block_input() Clean up unused parameter 'count' in o2hb_read_block_input(). Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1934abb..a8d15be 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1456,7 +1456,6 @@ static void o2hb_region_release(struct config_item *item) static int o2hb_read_block_input(struct o2hb_region *reg, const char *page, - size_t count, unsigned long *ret_bytes, unsigned int *ret_bits) { @@ -1499,8 +1498,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, if (reg->hr_bdev) return -EINVAL; - status = o2hb_read_block_input(reg, page, count, - &block_bytes, &block_bits); + status = o2hb_read_block_input(reg, page, &block_bytes, + &block_bits); if (status) return status; -- cgit v0.10.2 From 8f9b1802c20ace68a7d6603e1fd82a44fedb4078 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Thu, 19 May 2016 17:09:53 -0700 Subject: ocfs2: clean up an unneeded goto in ocfs2_put_slot() The goto is not useful in ocfs2_put_slot(), so delete it. Signed-off-by: Guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1e09592..d740799 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb) spin_unlock(&osb->osb_lock); status = ocfs2_update_disk_slot(osb, si, slot_num); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto bail; - } -bail: ocfs2_free_slot_info(osb); } - -- cgit v0.10.2 From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 19 May 2016 17:09:56 -0700 Subject: kernel/padata.c: removed unused code By accident I stumbled across code that has never been used. This driver has EXPORT_SYMBOL functions, and the only user of the code is pcrypt.c, but this only uses a subset of the exported symbols. According to 'git log -G', the functions, padata_set_cpumasks, padata_add_cpu, and padata_remove_cpu have never been used since they were first introduced. This patch removes the unused code. On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than 4k smaller. 
kbuild_hp> size $KBUILD_OUTPUT/vmlinux text data bss dec hex filename 10566658 4678360 1122304 16367322 f9beda vmlinux 10561984 4678360 1122304 16362648 f9ac98 vmlinux On another config, 32 bit, the saving is about 0.5k bytes. kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux 6012005 2409513 2785280 11206798 ab008e vmlinux 6011491 2409513 2785280 11206284 aafe8c vmlinux Signed-off-by: Richard Cochran Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/padata.h b/include/linux/padata.h index 4386946..113ee62 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst, extern void padata_do_serial(struct padata_priv *padata); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern int padata_set_cpumasks(struct padata_instance *pinst, - cpumask_var_t pcpumask, - cpumask_var_t cbcpumask); -extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask); -extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask); extern int padata_start(struct padata_instance *pinst); extern void padata_stop(struct padata_instance *pinst); extern int padata_register_cpumask_notifier(struct padata_instance *pinst, diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9..67ddd4a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -607,33 +607,6 @@ out_replace: } /** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - -/** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. * @@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
- * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v0.10.2 From 19d795b677bda354644cfb87a196b087fdc2a965 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 May 2016 17:09:59 -0700 Subject: kernel/padata.c: hide unused functions A recent cleanup removed some exported functions that were not used anywhere, which in turn exposed the fact that some other functions in the same file are only used in some configurations. We now get a warning about them when CONFIG_HOTPLUG_CPU is disabled: kernel/padata.c:670:12: error: '__padata_remove_cpu' defined but not used [-Werror=unused-function] static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) ^~~~~~~~~~~~~~~~~~~ kernel/padata.c:650:12: error: '__padata_add_cpu' defined but not used [-Werror=unused-function] static int __padata_add_cpu(struct padata_instance *pinst, int cpu) This rearranges the code so the __padata_remove_cpu/__padata_add_cpu functions are within the #ifdef that protects the code that calls them. 
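The warning pattern here is generic C hygiene rather than anything padata-specific: once the preprocessor removes a static function's only callers, the function is dead code to the compiler and -Wunused-function fires. A minimal sketch of the rule the patch restores (hypothetical names, not from the kernel tree):

#ifdef CONFIG_HOTPLUG_CPU
/* Defined inside the same guard as its only caller, so no
 * -Wunused-function warning when CONFIG_HOTPLUG_CPU is disabled. */
static int example_cpu_helper(int cpu)
{
	return cpu >= 0;	/* stand-in for the real per-cpu work */
}

int example_hotplug_callback(int cpu)
{
	return example_cpu_helper(cpu);
}
#endif /* CONFIG_HOTPLUG_CPU */

Moving the helper above the #ifdef while its caller stays inside reproduces exactly the kind of warnings quoted above.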
[akpm@linux-foundation.org: coding-style fixes] Fixes: 4ba6d78c671e ("kernel/padata.c: removed unused code") Signed-off-by: Arnd Bergmann Cc: Richard Cochran Cc: Steffen Klassert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/padata.c b/kernel/padata.c index 67ddd4a..9932788 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -647,6 +647,43 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err = -EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; @@ -726,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) } EXPORT_SYMBOL(padata_remove_cpu); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || -- cgit v0.10.2 From 18726ca8b34bbfb3ab5a1c0a52a5d8dd392466ed Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:02 -0700 Subject: mm/slab: fix the theoretical race by holding proper lock While processing concurrent allocation, SLAB could be contended a lot because it did a lots of work with holding a lock. This patchset try to reduce the number of critical section to reduce lock contention. Major changes are lockless decision to allocate more slab and lockless cpu cache refill from the newly allocated slab. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. 
* Before Kmalloc N*alloc N*free(32): Average=365/806 Kmalloc N*alloc N*free(64): Average=452/690 Kmalloc N*alloc N*free(128): Average=736/886 Kmalloc N*alloc N*free(256): Average=1167/985 Kmalloc N*alloc N*free(512): Average=2088/1125 Kmalloc N*alloc N*free(1024): Average=4115/1184 Kmalloc N*alloc N*free(2048): Average=8451/1748 Kmalloc N*alloc N*free(4096): Average=16024/2048 * After Kmalloc N*alloc N*free(32): Average=344/792 Kmalloc N*alloc N*free(64): Average=347/882 Kmalloc N*alloc N*free(128): Average=390/959 Kmalloc N*alloc N*free(256): Average=393/1067 Kmalloc N*alloc N*free(512): Average=683/1229 Kmalloc N*alloc N*free(1024): Average=1295/1325 Kmalloc N*alloc N*free(2048): Average=2513/1664 Kmalloc N*alloc N*free(4096): Average=4742/2172 It shows that performance improves greatly (by roughly more than 50%) for object classes whose size is more than 128 bytes. This patch (of 11): If we hold neither the slab_mutex nor the node lock, the node's shared array cache can be freed and re-populated. If __kmem_cache_shrink() is called at the same time, it will call drain_array() with n->shared without holding the node lock, so a problem can happen. This patch fixes the situation by holding the node lock before trying to drain the shared array. In addition, add a debug check to confirm that no n->shared access race exists. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 17e2848..3f1cc1c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2180,6 +2180,11 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } +static void check_mutex_acquired(void) +{ + BUG_ON(!mutex_is_locked(&slab_mutex)); +} + static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP @@ -2199,13 +2204,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) +#define check_mutex_acquired() do { } while(0) #define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, - int force, int node); +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, + int node, bool free_all, struct list_head *list) +{ + int tofree; + + if (!ac || !ac->avail) + return; + + tofree = free_all ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + + free_block(cachep, ac->entry, tofree, node, list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); +} static void do_drain(void *arg) { @@ -2229,6 +2248,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_cache_node *n; int node; + LIST_HEAD(list); on_each_cpu(do_drain, cachep, 1); check_irq_on(); @@ -2236,8 +2256,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep) if (n->alien) drain_alien_cache(cachep, n->alien); - for_each_kmem_cache_node(cachep, node, n) - drain_array(cachep, n, n->shared, 1, node); + for_each_kmem_cache_node(cachep, node, n) { + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } } /* @@ -3869,29 +3894,26 @@ skip_setup: * if drain_array() is used on the shared array.
*/ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int force, int node) + struct array_cache *ac, int node) { LIST_HEAD(list); - int tofree; + + /* ac from n->shared can be freed if we don't hold the slab_mutex. */ + check_mutex_acquired(); if (!ac || !ac->avail) return; - if (ac->touched && !force) { + + if (ac->touched) { ac->touched = 0; - } else { - spin_lock_irq(&n->list_lock); - if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node, &list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); - } - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); + return; } + + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); } /** @@ -3929,7 +3951,7 @@ static void cache_reap(struct work_struct *w) reap_alien(searchp, n); - drain_array(searchp, n, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), node); /* * These are racy checks but it does not matter @@ -3940,7 +3962,7 @@ static void cache_reap(struct work_struct *w) n->next_reap = jiffies + REAPTIMEOUT_NODE; - drain_array(searchp, n, n->shared, 0, node); + drain_array(searchp, n, n->shared, node); if (n->free_touched) n->free_touched = 0; -- cgit v0.10.2 From 8888177ea116d4d14ca0a2ba054d02f35b0dae29 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:05 -0700 Subject: mm/slab: remove BAD_ALIEN_MAGIC again Initial attemp to remove BAD_ALIEN_MAGIC is once reverted by 'commit edcad2509550 ("Revert "slab: remove BAD_ALIEN_MAGIC"")' because it causes a problem on m68k which has many node but !CONFIG_NUMA. In this case, although alien cache isn't used at all but to cope with some initialization path, garbage value is used and that is BAD_ALIEN_MAGIC. Now, this patch set use_alien_caches to 0 when !CONFIG_NUMA, there is no initialization path problem so we don't need BAD_ALIEN_MAGIC at all. So remove it. Signed-off-by: Joonsoo Kim Tested-by: Geert Uytterhoeven Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 3f1cc1c..f36d349 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -421,8 +421,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -644,7 +642,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) @@ -1212,7 +1210,7 @@ void __init kmem_cache_init(void) sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; - if (num_possible_nodes() == 1) + if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) -- cgit v0.10.2 From a5aa63a5f7352aa8991f64d46854dcb8d3788d55 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:08 -0700 Subject: mm/slab: drain the free slab as much as possible slabs_tofree() implies freeing all free slab. 
We can do it with just providing INT_MAX. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index f36d349..a998d35 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -895,12 +895,6 @@ static int init_cache_node_node(int node) return 0; } -static inline int slabs_tofree(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - return (n->free_objects + cachep->num - 1) / cachep->num; -} - static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -965,7 +959,7 @@ free_slab: n = get_node(cachep, node); if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); } } @@ -1117,7 +1111,7 @@ static int __meminit drain_cache_node_node(int node) if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); if (!list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial)) { @@ -2311,7 +2305,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) check_irq_on(); for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); ret += !list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial); -- cgit v0.10.2 From ded0ecf61118930988f0943e741056c8fd5d439c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:11 -0700 Subject: mm/slab: factor out kmem_cache_node initialization code It can be reused on other place, so factor out it. Following patch will use it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index a998d35..9bef33b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -848,6 +848,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags) } #endif +static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) +{ + struct kmem_cache_node *n; + + /* + * Set up the kmem_cache_node for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + n = get_node(cachep, node); + if (n) { + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; + spin_unlock_irq(&n->list_lock); + + return 0; + } + + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) + return -ENOMEM; + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + n->free_limit = + (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; + + /* + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient + * protection here. + */ + cachep->node[node] = n; + + return 0; +} + /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node @@ -859,39 +899,15 @@ static inline gfp_t gfp_exact_node(gfp_t flags) */ static int init_cache_node_node(int node) { + int ret; struct kmem_cache *cachep; - struct kmem_cache_node *n; - const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. 
Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (!n) { - n = kmalloc_node(memsize, GFP_KERNEL, node); - if (!n) - return -ENOMEM; - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex is sufficient - * protection here. - */ - cachep->node[node] = n; - } - - spin_lock_irq(&n->list_lock); - n->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + ret = init_cache_node(cachep, node, GFP_KERNEL); + if (ret) + return ret; } + return 0; } -- cgit v0.10.2 From c3d332b6b2c11ddda9cce3e2f3135b68929d4b82 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:14 -0700 Subject: mm/slab: clean-up kmem_cache_node setup There are mostly same code for setting up kmem_cache_node either in cpuup_prepare() or alloc_kmem_cache_node(). Factor out and clean-up them. Signed-off-by: Joonsoo Kim Tested-by: Nishanth Menon Tested-by: Jon Hunter Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 9bef33b..f1db679 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -911,6 +911,63 @@ static int init_cache_node_node(int node) return 0; } +static int setup_kmem_cache_node(struct kmem_cache *cachep, + int node, gfp_t gfp, bool force_change) +{ + int ret = -ENOMEM; + struct kmem_cache_node *n; + struct array_cache *old_shared = NULL; + struct array_cache *new_shared = NULL; + struct alien_cache **new_alien = NULL; + LIST_HEAD(list); + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); + if (!new_shared) + goto fail; + } + + ret = init_cache_node(cachep, node, gfp); + if (ret) + goto fail; + + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); + n->shared->avail = 0; + } + + if (!n->shared || force_change) { + old_shared = n->shared; + n->shared = new_shared; + new_shared = NULL; + } + + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + +fail: + kfree(old_shared); + kfree(new_shared); + free_alien_cache(new_alien); + + return ret; +} + static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -982,7 +1039,6 @@ free_slab: static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -1001,44 +1057,9 @@ static int cpuup_prepare(long cpu) * array caches */ list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *shared = NULL; - struct alien_cache **alien = NULL; - - if (cachep->shared) { - shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, - 0xbaadf00d, GFP_KERNEL); - if (!shared) - goto bad; - } - if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); - if (!alien) { - kfree(shared); - goto bad; - } - } - n = get_node(cachep, node); - BUG_ON(!n); - - spin_lock_irq(&n->list_lock); - if (!n->shared) { - /* - * We are serialised from CPU_DEAD or - * CPU_UP_CANCELLED by the 
cpucontrol lock - */ - n->shared = shared; - shared = NULL; - } -#ifdef CONFIG_NUMA - if (!n->alien) { - n->alien = alien; - alien = NULL; - } -#endif - spin_unlock_irq(&n->list_lock); - kfree(shared); - free_alien_cache(alien); + err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); + if (err) + goto bad; } return 0; @@ -3678,72 +3699,19 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) +static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) { + int ret; int node; struct kmem_cache_node *n; - struct array_cache *new_shared; - struct alien_cache **new_alien = NULL; for_each_online_node(node) { - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - new_shared = NULL; - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d, gfp); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; - } - } - - n = get_node(cachep, node); - if (n) { - struct array_cache *shared = n->shared; - LIST_HEAD(list); - - spin_lock_irq(&n->list_lock); - - if (shared) - free_block(cachep, shared->entry, - shared->avail, node, &list); - - n->shared = new_shared; - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - kfree(shared); - free_alien_cache(new_alien); - continue; - } - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) { - free_alien_cache(new_alien); - kfree(new_shared); + ret = setup_kmem_cache_node(cachep, node, gfp, true); + if (ret) goto fail; - } - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - n->shared = new_shared; - n->alien = new_alien; - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - cachep->node[node] = n; } + return 0; fail: @@ -3785,7 +3753,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; if (!prev) - goto alloc_node; + goto setup_node; for_each_online_cpu(cpu) { LIST_HEAD(list); @@ -3802,8 +3770,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } free_percpu(prev); -alloc_node: - return alloc_kmem_cache_node(cachep, gfp); +setup_node: + return setup_kmem_cache_nodes(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, -- cgit v0.10.2 From 6052b7880a95554993898f7cac075c2669f1dd7c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:17 -0700 Subject: mm/slab: don't keep free slabs if free_objects exceeds free_limit Currently, determination to free a slab is done whenever each freed object is put into the slab. This has a following problem. Assume free_limit = 10 and nr_free = 9. Free happens as following sequence and nr_free changes as following. free(become a free slab) free(not become a free slab) nr_free: 9 -> 10 (at first free) -> 11 (at second free) If we try to check if we can free current slab or not on each object free, we can't free any slab in this situation because current slab isn't a free slab when nr_free exceed free_limit (at second free) even if there is a free slab. However, if we check it lastly, we can free 1 free slab. This problem would cause to keep too much memory in the slab subsystem. 
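To make the changelog's trace concrete, here is a small self-contained sketch (plain C with made-up constants, not the kernel code) of why deferring the limit check to the end of the batch reclaims the slab that the per-object check misses:

#define FREE_LIMIT	10	/* stands in for n->free_limit */
#define OBJS_PER_SLAB	2	/* stands in for cachep->num */

/*
 * Batch-end check: reclaim while we are over the limit and a fully
 * free slab exists. In the 9 -> 10 -> 11 trace above, the first free
 * completes a free slab at nr_free == 10 (not over the limit) and the
 * second pushes nr_free to 11 without completing one, so a per-object
 * check frees nothing; checking once at the end frees one slab.
 */
static int reclaim_after_batch(int nr_free, int nr_free_slabs)
{
	int freed = 0;

	while (nr_free > FREE_LIMIT && nr_free_slabs > 0) {
		nr_free -= OBJS_PER_SLAB;
		nr_free_slabs--;
		freed++;
	}
	return freed;	/* reclaim_after_batch(11, 1) == 1 */
}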
This patch tries to fix it by checking the number of free objects after all the free work is done. If there are free slabs at that point, we can free as many of them as possible, keeping the number of free slabs minimal. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index f1db679..3f16475 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3296,6 +3296,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp, { int i; struct kmem_cache_node *n = get_node(cachep, node); + struct page *page; + + n->free_objects += nr_objects; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3308,17 +3311,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); - n->free_objects++; /* fixup slab chains */ - if (page->active == 0) { - if (n->free_objects > n->free_limit) { - n->free_objects -= cachep->num; - list_add_tail(&page->lru, list); - } else { - list_add(&page->lru, &n->slabs_free); - } - } else { + if (page->active == 0) + list_add(&page->lru, &n->slabs_free); + else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3326,6 +3323,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp, list_add_tail(&page->lru, &n->slabs_partial); } } + + while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { + n->free_objects -= cachep->num; + + page = list_last_entry(&n->slabs_free, struct page, lru); + list_del(&page->lru); + list_add(&page->lru, list); + } } static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) -- cgit v0.10.2 From 03d1d43a1262b347a9aa814980438fff8eb32edc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:20 -0700 Subject: mm/slab: racy access/modify the slab color The slab color does not need to be updated strictly, and taking a lock just to change it would add contention, so this patch accesses and modifies the slab color racily. This is a preparation step for implementing a lockless allocation path for when there are no free objects in the kmem_cache. Below is the result of concurrent allocation/free in the slab allocation benchmark made by Christoph a long time ago. I have simplified the output. The numbers show cycle counts during alloc/free respectively, so less is better. * Before Kmalloc N*alloc N*free(32): Average=365/806 Kmalloc N*alloc N*free(64): Average=452/690 Kmalloc N*alloc N*free(128): Average=736/886 Kmalloc N*alloc N*free(256): Average=1167/985 Kmalloc N*alloc N*free(512): Average=2088/1125 Kmalloc N*alloc N*free(1024): Average=4115/1184 Kmalloc N*alloc N*free(2048): Average=8451/1748 Kmalloc N*alloc N*free(4096): Average=16024/2048 * After Kmalloc N*alloc N*free(32): Average=355/750 Kmalloc N*alloc N*free(64): Average=452/812 Kmalloc N*alloc N*free(128): Average=559/1070 Kmalloc N*alloc N*free(256): Average=1176/980 Kmalloc N*alloc N*free(512): Average=1939/1189 Kmalloc N*alloc N*free(1024): Average=3521/1278 Kmalloc N*alloc N*free(2048): Average=7152/1838 Kmalloc N*alloc N*free(4096): Average=13438/2013 It shows that contention is reduced for object sizes >= 1024 and performance increases by roughly 15%.
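Why the race is tolerable can be shown in isolation. Below is a sketch (hypothetical struct, not the kernel code) of the lockless colour update the patch performs: a lost or duplicated increment merely repeats a colour, and the bound is re-checked before use, so the worst case is a slightly less even stagger, never an out-of-range offset:

struct colour_state {
	unsigned int colour_next;	/* advanced without a lock */
	unsigned int colour;		/* number of distinct colours */
	unsigned int colour_off;	/* bytes per colour step */
};

static unsigned int next_colour_offset(struct colour_state *c)
{
	unsigned int offset;

	c->colour_next++;		/* racy read-modify-write */
	if (c->colour_next >= c->colour)
		c->colour_next = 0;

	offset = c->colour_next;
	if (offset >= c->colour)	/* re-check: another CPU may have */
		offset = 0;		/* raced past the bound meanwhile */

	return offset * c->colour_off;
}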
Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 3f16475..e181cfb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2574,20 +2574,7 @@ static int cache_grow(struct kmem_cache *cachep, } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = get_node(cachep, nodeid); - spin_lock(&n->list_lock); - - /* Get colour for the slab, and cal the next value. */ - offset = n->colour_next; - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - spin_unlock(&n->list_lock); - - offset *= cachep->colour_off; - if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); @@ -2608,6 +2595,19 @@ static int cache_grow(struct kmem_cache *cachep, if (!page) goto failed; + n = get_node(cachep, nodeid); + + /* Get colour for the slab, and cal the next value. */ + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + + offset = n->colour_next; + if (offset >= cachep->colour) + offset = 0; + + offset *= cachep->colour_off; + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); -- cgit v0.10.2 From 511e3a05881221a7fc63e36f3d604887040fc845 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:23 -0700 Subject: mm/slab: make cache_grow() handle the page allocated on arbitrary node Currently, cache_grow() assumes that allocated page's nodeid would be same with parameter nodeid which is used for allocation request. If we discard this assumption, we can handle fallback_alloc() case gracefully. So, this patch makes cache_grow() handle the page allocated on arbitrary node and clean-up relevant code. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index e181cfb..b303c04 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2556,13 +2556,14 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, struct page *page) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *freelist; size_t offset; gfp_t local_flags; + int page_node; struct kmem_cache_node *n; + struct page *page; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2590,12 +2591,12 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!page) - page = kmem_getpages(cachep, local_flags, nodeid); + page = kmem_getpages(cachep, local_flags, nodeid); if (!page) goto failed; - n = get_node(cachep, nodeid); + page_node = page_to_nid(page); + n = get_node(cachep, page_node); /* Get colour for the slab, and cal the next value. */ n->colour_next++; @@ -2610,7 +2611,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. 
*/ freelist = alloc_slabmgmt(cachep, page, offset, - local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, page_node); if (OFF_SLAB(cachep) && !freelist) goto opps1; @@ -2629,13 +2630,13 @@ static int cache_grow(struct kmem_cache *cachep, STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); - return 1; + return page_node; opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return 0; + return -1; } #if DEBUG @@ -2916,14 +2917,14 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && ac->avail == 0) + if (x < 0 && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -3052,7 +3053,6 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; - gfp_t local_flags; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); @@ -3063,8 +3063,6 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); @@ -3094,33 +3092,17 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - struct page *page; + nid = cache_grow(cache, flags, numa_mem_id()); + if (nid >= 0) { + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - kmem_flagcheck(cache, flags); - page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - if (page) { /* - * Insert into the appropriate per node queues + * Another processor may allocate the objects in + * the slab since we are not holding any locks. */ - nid = page_to_nid(page); - if (cache_grow(cache, flags, nid, page)) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (!obj) - /* - * Another processor may allocate the - * objects in the slab since we are - * not holding any locks. - */ - goto retry; - } else { - /* cache_grow already freed obj */ - obj = NULL; - } + if (!obj) + goto retry; } } @@ -3171,8 +3153,8 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); - if (x) + x = cache_grow(cachep, gfp_exact_node(flags), nodeid); + if (x >= 0) goto retry; return fallback_alloc(cachep, flags); -- cgit v0.10.2 From 76b342bdc71badea2cbac7bf6590aa86e895c507 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:26 -0700 Subject: mm/slab: separate cache_grow() to two parts This is a preparation step to implement lockless allocation path when there is no free objects in kmem_cache. What we'd like to do here is to refill cpu cache without holding a node lock. To accomplish this purpose, refill should be done after new slab allocation but before attaching the slab to the management list. 
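In the abstract this is the classic private-then-publish split. A sketch of the shape (illustrative kernel-style C with hypothetical names, not the actual patch) that the following changes give cache_grow():

struct item {
	struct list_head node;
	/* payload */
};

/* Phase one: build the object with no shared lock held. It is not yet
 * visible to other CPUs, so the caller may touch it without locking. */
static struct item *grow_begin(void)
{
	return alloc_and_init_item();	/* hypothetical allocator */
}

/* Phase two: publish under the list lock, making it globally visible. */
static void grow_end(spinlock_t *lock, struct list_head *shared,
		     struct item *it)
{
	if (!it)
		return;
	spin_lock(lock);
	list_add_tail(&it->node, shared);
	spin_unlock(lock);
}

Anything done between the two phases operates on a still-private slab.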
So, this patch separates cache_grow() to two parts, allocation and attaching to the list in order to add some code inbetween them in the following patch. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index b303c04..8c4db214 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list); +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list); static int slab_early_init = 1; #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) @@ -1810,7 +1815,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, /* * Needed to avoid possible looping condition - * in cache_grow() + * in cache_grow_begin() */ if (OFF_SLAB(freelist_cache)) continue; @@ -2556,7 +2561,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *cache_grow_begin(struct kmem_cache *cachep, + gfp_t flags, int nodeid) { void *freelist; size_t offset; @@ -2622,21 +2628,40 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - check_irq_off(); - spin_lock(&n->list_lock); - /* Make slab active. */ - list_add_tail(&page->lru, &(n->slabs_free)); - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num; - spin_unlock(&n->list_lock); - return page_node; + return page; + opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return -1; + return NULL; +} + +static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +{ + struct kmem_cache_node *n; + void *list = NULL; + + check_irq_off(); + + if (!page) + return; + + INIT_LIST_HEAD(&page->lru); + n = get_node(cachep, page_to_nid(page)); + + spin_lock(&n->list_lock); + if (!page->active) + list_add_tail(&page->lru, &(n->slabs_free)); + else + fixup_slab_list(cachep, n, page, &list); + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; + spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); } #if DEBUG @@ -2847,6 +2872,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; int node; void *list = NULL; + struct page *page; check_irq_off(); node = numa_mem_id(); @@ -2874,7 +2900,6 @@ retry: } while (batchcount > 0) { - struct page *page; /* Get slab alloc is to come from. 
*/ page = get_first_slab(n, false); if (!page) @@ -2907,8 +2932,6 @@ alloc_done: fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { - int x; - /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { void *obj = cache_alloc_pfmemalloc(cachep, n, flags); @@ -2917,14 +2940,18 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node); + page = cache_grow_begin(cachep, gfp_exact_node(flags), node); + cache_grow_end(cachep, page); - /* cache_grow can reenable interrupts, then ac could change. */ + /* + * cache_grow_begin() can reenable interrupts, + * then ac could change. + */ ac = cpu_cache_get(cachep); node = numa_mem_id(); /* no objects in sight? abort */ - if (x < 0 && ac->avail == 0) + if (!page && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -3057,6 +3084,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; + struct page *page; int nid; unsigned int cpuset_mems_cookie; @@ -3092,8 +3120,10 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - nid = cache_grow(cache, flags, numa_mem_id()); - if (nid >= 0) { + page = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, page); + if (page) { + nid = page_to_nid(page); obj = ____cache_alloc_node(cache, gfp_exact_node(flags), nid); @@ -3121,7 +3151,6 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct kmem_cache_node *n; void *obj; void *list = NULL; - int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); @@ -3153,8 +3182,9 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid); - if (x >= 0) + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + cache_grow_end(cachep, page); + if (page) goto retry; return fallback_alloc(cachep, flags); -- cgit v0.10.2 From 213b46958c65c7adaaf3201102da16ce0264e9cf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:29 -0700 Subject: mm/slab: refill cpu cache through a new slab without holding a node lock Until now, cache growing makes a free slab on node's slab list and then we can allocate free objects from it. This necessarily requires to hold a node lock which is very contended. If we refill cpu cache before attaching it to node's slab list, we can avoid holding a node lock as much as possible because this newly allocated slab is only visible to the current task. This will reduce lock contention. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. 
* Before Kmalloc N*alloc N*free(32): Average=355/750 Kmalloc N*alloc N*free(64): Average=452/812 Kmalloc N*alloc N*free(128): Average=559/1070 Kmalloc N*alloc N*free(256): Average=1176/980 Kmalloc N*alloc N*free(512): Average=1939/1189 Kmalloc N*alloc N*free(1024): Average=3521/1278 Kmalloc N*alloc N*free(2048): Average=7152/1838 Kmalloc N*alloc N*free(4096): Average=13438/2013 * After Kmalloc N*alloc N*free(32): Average=248/966 Kmalloc N*alloc N*free(64): Average=261/949 Kmalloc N*alloc N*free(128): Average=314/1016 Kmalloc N*alloc N*free(256): Average=741/1061 Kmalloc N*alloc N*free(512): Average=1246/1152 Kmalloc N*alloc N*free(1024): Average=2437/1259 Kmalloc N*alloc N*free(2048): Average=4980/1800 Kmalloc N*alloc N*free(4096): Average=9000/2078 It shows that contention is reduced for all the object sizes and performance increases by 30 ~ 40%. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 8c4db214..37600e9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2865,6 +2865,30 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, return obj; } +/* + * Slab list should be fixed up by fixup_slab_list() for existing slab + * or cache_grow_end() for new slab + */ +static __always_inline int alloc_block(struct kmem_cache *cachep, + struct array_cache *ac, struct page *page, int batchcount) +{ + /* + * There must be at least one object available for + * allocation. + */ + BUG_ON(page->active >= cachep->num); + + while (page->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, page); + } + + return batchcount; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; @@ -2877,7 +2901,6 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); node = numa_mem_id(); -retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2907,21 +2930,7 @@ retry: check_spinlock_acquired(cachep); - /* - * The slab was either on partial or free list so - * there must be at least one object available for - * allocation. - */ - BUG_ON(page->active >= cachep->num); - - while (page->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, page); - } - + batchcount = alloc_block(cachep, ac, page, batchcount); fixup_slab_list(cachep, n, page, &list); } @@ -2941,21 +2950,18 @@ alloc_done: } page = cache_grow_begin(cachep, gfp_exact_node(flags), node); - cache_grow_end(cachep, page); /* * cache_grow_begin() can reenable interrupts, * then ac could change. */ ac = cpu_cache_get(cachep); - node = numa_mem_id(); + if (!ac->avail && page) + alloc_block(cachep, ac, page, batchcount); + cache_grow_end(cachep, page); - /* no objects in sight? abort */ - if (!page && ac->avail == 0) + if (!ac->avail) return NULL; - - if (!ac->avail) /* objects refilled by interrupt? 
*/ - goto retry; } ac->touched = 1; @@ -3149,14 +3155,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct page *page; struct kmem_cache_node *n; - void *obj; + void *obj = NULL; void *list = NULL; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); -retry: check_irq_off(); spin_lock(&n->list_lock); page = get_first_slab(n, false); @@ -3178,19 +3183,18 @@ retry: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); - goto done; + return obj; must_grow: spin_unlock(&n->list_lock); page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ + obj = slab_get_obj(cachep, page); + } cache_grow_end(cachep, page); - if (page) - goto retry; - return fallback_alloc(cachep, flags); - -done: - return obj; + return obj ? obj : fallback_alloc(cachep, flags); } static __always_inline void * -- cgit v0.10.2 From 801faf0db8947e01877920e848a4d338dd7a99e7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:31 -0700 Subject: mm/slab: lockless decision to grow cache To check whether free objects exist or not precisely, we need to grab a lock. But, accuracy isn't that important because race window would be even small and if there is too much free object, cache reaper would reap it. So, this patch makes the check for free object exisistence not to hold a lock. This will reduce lock contention in heavily allocation case. Note that until now, n->shared can be freed during the processing by writing slabinfo, but, with some trick in this patch, we can access it freely within interrupt disabled period. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. * Before Kmalloc N*alloc N*free(32): Average=248/966 Kmalloc N*alloc N*free(64): Average=261/949 Kmalloc N*alloc N*free(128): Average=314/1016 Kmalloc N*alloc N*free(256): Average=741/1061 Kmalloc N*alloc N*free(512): Average=1246/1152 Kmalloc N*alloc N*free(1024): Average=2437/1259 Kmalloc N*alloc N*free(2048): Average=4980/1800 Kmalloc N*alloc N*free(4096): Average=9000/2078 * After Kmalloc N*alloc N*free(32): Average=344/792 Kmalloc N*alloc N*free(64): Average=347/882 Kmalloc N*alloc N*free(128): Average=390/959 Kmalloc N*alloc N*free(256): Average=393/1067 Kmalloc N*alloc N*free(512): Average=683/1229 Kmalloc N*alloc N*free(1024): Average=1295/1325 Kmalloc N*alloc N*free(2048): Average=2513/1664 Kmalloc N*alloc N*free(4096): Average=4742/2172 It shows that allocation performance decreases for the object size up to 128 and it may be due to extra checks in cache_alloc_refill(). But, with considering improvement of free performance, net result looks the same. Result for other size class looks very promising, roughly, 50% performance improvement. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 37600e9..8133ebe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -965,6 +965,15 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); + /* + * To protect lockless access to n->shared during irq disabled context. 
+ * If n->shared isn't NULL in irq disabled context, accessing to it is + * guaranteed to be valid until irq is re-enabled, because it will be + * freed after synchronize_sched(). + */ + if (force_change) + synchronize_sched(); + fail: kfree(old_shared); kfree(new_shared); @@ -2893,7 +2902,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; - struct array_cache *ac; + struct array_cache *ac, *shared; int node; void *list = NULL; struct page *page; @@ -2914,11 +2923,16 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); + shared = READ_ONCE(n->shared); + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ - if (n->shared && transfer_objects(ac, n->shared, batchcount)) { - n->shared->touched = 1; + if (shared && transfer_objects(ac, shared, batchcount)) { + shared->touched = 1; goto alloc_done; } @@ -2940,6 +2954,7 @@ alloc_done: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); +direct_grow: if (unlikely(!ac->avail)) { /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { -- cgit v0.10.2 From 81ae6d03952c1bfb96e1a716809bd65e7cd14360 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 19 May 2016 17:10:34 -0700 Subject: mm/slub.c: replace kick_all_cpus_sync() with synchronize_sched() in kmem_cache_shrink() When we call __kmem_cache_shrink on memory cgroup removal, we need to synchronize kmem_cache->cpu_partial update with put_cpu_partial that might be running on other cpus. Currently, we achieve that by using kick_all_cpus_sync, which works as a system wide memory barrier. Though fast it is, this method has a flaw - it issues a lot of IPIs, which might hurt high performance or real-time workloads. To fix this, let's replace kick_all_cpus_sync with synchronize_sched. Although the latter one may take much longer to finish, it shouldn't be a problem in this particular case, because memory cgroups are destroyed asynchronously from a workqueue so that no user visible effects should be introduced. OTOH, it will save us from excessive IPIs when someone removes a cgroup. Anyway, even if using synchronize_sched turns out to take too long, we can always introduce a kind of __kmem_cache_shrink batching so that this method would only be called once per one cgroup destruction (not per each per memcg kmem cache as it is now). Signed-off-by: Vladimir Davydov Reported-by: Peter Zijlstra Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 4dbb109e..ba81cf6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) * s->cpu_partial is checked locklessly (see put_cpu_partial), * so we have to make sure the change is visible. */ - kick_all_cpus_sync(); + synchronize_sched(); } flush_all(s); -- cgit v0.10.2 From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 19 May 2016 17:10:37 -0700 Subject: mm: SLAB freelist randomization Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize the SLAB freelist. 
The list is randomized during initialization of a new set of pages. The order on different freelist sizes is pre-computed at boot for performance. Each kmem_cache has its own randomized freelist. Before pre-computed lists are available, freelists are generated dynamically. This security feature reduces the predictability of the kernel SLAB allocator against heap overflows, rendering attacks much less stable. For example, this attack against SLUB (also applicable against SLAB) would be affected: https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/ Also, since v4.6 the freelist has been moved to the end of the SLAB. This means a controllable heap is opened up to new attacks not yet publicly discussed. A kernel heap overflow can be transformed into multiple use-after-frees. This feature makes this type of attack harder too. To generate entropy, we use get_random_bytes_arch because 0 bits of entropy are available at the boot stage. In the worst case this function will fall back to the get_random_bytes sub API. We also generate a random shift number used to offset the pre-computed freelist for each new set of pages. The config option name is not specific to the SLAB as this approach will be extended to other allocators like SLUB. Performance results highlighted no major changes: Hackbench (running 90 10 times): Before average: 0.0698 After average: 0.0663 (-5.01%) slab_test 1 run on boot. A difference is only seen on the 2048 size test, which is the worst-case scenario covered by freelist randomization. New slab pages are constantly being created on the 10000 allocations. Variance should be mainly due to getting new pages every few allocations. Before: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles 10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles 10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles 10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles 10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles 10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles 10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles 10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles 10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles 10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles 10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles 10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 121 cycles 10000 times kmalloc(16)/kfree -> 121 cycles 10000 times kmalloc(32)/kfree -> 121 cycles 10000 times kmalloc(64)/kfree -> 121 cycles 10000 times kmalloc(128)/kfree -> 121 cycles 10000 times kmalloc(256)/kfree -> 119 cycles 10000 times kmalloc(512)/kfree -> 119 cycles 10000 times kmalloc(1024)/kfree -> 119 cycles 10000 times kmalloc(2048)/kfree -> 119 cycles 10000 times kmalloc(4096)/kfree -> 121 cycles 10000 times kmalloc(8192)/kfree -> 119 cycles 10000 times kmalloc(16384)/kfree -> 119 cycles After: Single thread testing ===================== 1.
Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles 10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles 10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles 10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles 10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles 10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles 10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles 10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles 10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles 10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles 10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles 10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 121 cycles 10000 times kmalloc(16)/kfree -> 121 cycles 10000 times kmalloc(32)/kfree -> 123 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 121 cycles 10000 times kmalloc(256)/kfree -> 119 cycles 10000 times kmalloc(512)/kfree -> 119 cycles 10000 times kmalloc(1024)/kfree -> 119 cycles 10000 times kmalloc(2048)/kfree -> 119 cycles 10000 times kmalloc(4096)/kfree -> 119 cycles 10000 times kmalloc(8192)/kfree -> 119 cycles 10000 times kmalloc(16384)/kfree -> 119 cycles [akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()] Signed-off-by: Thomas Garnier Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Greg Thelen Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9edbbf3..8694f7a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,6 +80,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM + void *random_seq; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/init/Kconfig b/init/Kconfig index 0dfd09d..79a91a2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1742,6 +1742,15 @@ config SLOB endchoice +config SLAB_FREELIST_RANDOM + default n + depends on SLAB + bool "SLAB freelist randomization" + help + Randomizes the freelist order used on creating new SLABs. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. 
+ config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slab.c b/mm/slab.c index 8133ebe..1ee26a0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). @@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization method available. + * Return true if the pre-computed list is available, false otherwise. + */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * Return true if the list was successfully shuffled, false otherwise.
+ */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. + */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); -- cgit v0.10.2 From a3187e438bc6565d6e54a550a19073d1b453f041 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2016 17:10:41 -0700 Subject: mm: slab: remove ZONE_DMA_FLAG Now that we have the IS_ENABLED helper to check whether a Kconfig option is enabled, ZONE_DMA_FLAG is no longer useful. Moreover, the use of ZONE_DMA_FLAG in slab looks pointless according to the comment [1] from Johannes Weiner, so remove it; ORing the passed-in flags with the cache gfp flags is already done in kmem_getpages().
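(An aside on the IS_ENABLED() pattern this commit leans on: the sketch below is illustrative only and is not part of the patch; the helper name slab_wants_dma() is hypothetical.)

#include <linux/kconfig.h>	/* IS_ENABLED() */
#include <linux/slab.h>		/* SLAB_CACHE_DMA */
#include <linux/types.h>	/* bool */

/*
 * Hypothetical helper: instead of keying off a generated integer such as
 * ZONE_DMA_FLAG, test the Kconfig symbol directly. IS_ENABLED() expands
 * to a compile-time 0 or 1, so the dead branch is still eliminated.
 */
static inline bool slab_wants_dma(unsigned long flags)
{
	return IS_ENABLED(CONFIG_ZONE_DMA) && (flags & SLAB_CACHE_DMA);
}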
[1] https://lkml.org/lkml/2014/9/25/553 Link: http://lkml.kernel.org/r/1462381297-11009-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/Kconfig b/mm/Kconfig index 989f8f3..d6e9042 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -268,11 +268,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -config ZONE_DMA_FLAG - int - default "0" if !ZONE_DMA - default "1" - config BOUNCE bool "Enable bounce buffers" default y diff --git a/mm/slab.c b/mm/slab.c index 1ee26a0..d81565a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2236,7 +2236,7 @@ done: cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; - if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2664,16 +2664,6 @@ static void cache_init_objs(struct kmem_cache *cachep, } } -static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) -{ - if (CONFIG_ZONE_DMA_FLAG) { - if (flags & GFP_DMA) - BUG_ON(!(cachep->allocflags & GFP_DMA)); - else - BUG_ON(cachep->allocflags & GFP_DMA); - } -} - static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; @@ -2753,14 +2743,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, local_irq_enable(); /* - * The test for missing atomic flag is performed here, rather than - * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is seriously mis-behaving they - * will eventually be caught here (where it matters). - */ - kmem_flagcheck(cachep, flags); - - /* * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ @@ -3145,9 +3127,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(gfpflags_allow_blocking(flags)); -#if DEBUG - kmem_flagcheck(cachep, flags); -#endif } #if DEBUG -- cgit v0.10.2 From 43efd3ea64f3cf8920e8793e6953321a466023e3 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Thu, 19 May 2016 17:10:43 -0700 Subject: mm/slub.c: fix sysfs filename in comment /sys/kernel/slab/xx/defrag_ratio should be remote_node_defrag_ratio. Link: http://lkml.kernel.org/r/1463449242-5366-1-git-send-email-lip@dtdream.com Signed-off-by: Li Peng Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index ba81cf6..8671de2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes. 
+ This means scanning over all nodes to look for partial slabs which + may be expensive if we do it every time we are trying to find a slab * with available objects. */ if (!s->remote_node_defrag_ratio || -- cgit v0.10.2 From 6d061f9f6136d477932088c24ce155d7dc785746 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:46 -0700 Subject: mm/page_ref: use page_ref helper instead of direct modification of _count The page reference manipulation functions were introduced to track reference count changes of a page. Use them instead of directly modifying _count. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. Miller" Cc: Sunil Goutham Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 06b819d..0ff8e60 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic) if (!nic->rb_pageref || !nic->rb_page) return; - atomic_add(nic->rb_pageref, &nic->rb_page->_count); + page_ref_add(nic->rb_page, nic->rb_pageref); nic->rb_pageref = 0; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 8114541..3aabfc0 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev, * network stack to take the ownership of the page * which can be recycled multiple times by the driver. */ - atomic_inc(&curr_cons->data->_count); + page_ref_inc(curr_cons->data); qede_reuse_page(edev, rxq, curr_cons); } diff --git a/mm/filemap.c b/mm/filemap.c index 182b218..0169033 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) * some other bad page check should catch it later. */ page_mapcount_reset(page); - atomic_sub(mapcount, &page->_count); + page_ref_sub(page, mapcount); } } diff --git a/net/wireless/util.c b/net/wireless/util.c index 219bd19..4e809e9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page, struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; - atomic_inc(&page->_count); + page_ref_inc(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } -- cgit v0.10.2 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that the field for the reference count of struct page is _count and of atomic type. They may try to handle it directly, and this could break the purpose of the page reference count tracepoints. To prevent direct _count modification, this patch renames it to _refcount and adds a warning message in the code. After that, developers who need to handle the reference count will find that the field should not be accessed directly.
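(The conversion in the hunks above is mechanical. As a minimal sketch, assuming a hypothetical driver helper rather than code from any of the files touched, the old direct _count arithmetic maps one-to-one onto the wrappers:)

#include <linux/mm_types.h>	/* struct page */
#include <linux/page_ref.h>	/* page_ref_add() */

/*
 * Hypothetical helper: take nr extra references on a receive buffer
 * page that will be recycled across several descriptors.
 */
static void rx_page_take_refs(struct page *page, int nr)
{
	/* Was: atomic_add(nr, &page->_count); -- invisible to tracing */
	page_ref_add(page, nr);	/* also fires the page_ref tracepoint when enabled */
}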
[akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index d9cb65c..fb0e1f2 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock. Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate in head page's ->_count. + - get_page()/put_page() and GUP operate in head page's ->_refcount. - - ->_count in tail pages is always zero: get_page_unless_zero() never + - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeed on tail pages. - map/unmap of the pages with PTE entry increment/decrement ->_mapcount @@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). -split_huge_page uses migration entries to stabilize page->_count and +split_huge_page uses migration entries to stabilize page->_refcount and page->_mapcount. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). -All tail pages has zero ->_count until atomic_add(). It prevent scanner +All tail pages has zero ->_refcount until atomic_add(). It prevent scanner from geting reference to tail page up to the point. After the atomic_add() -we don't care about ->_count value. We already known how many references +we don't care about ->_refcount value. We already known how many references with should uncharge from head page. For head page get_page_unless_zero() will succeed and we don't mind. It's diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index a0582b7..adce254 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end) * Hacky direct set to avoid unnecessary * lock take/release for EVERY page here. */ - p->_count.counter = 0; + p->_refcount.counter = 0; p->_mapcount.counter = -1; } init_page_count(page); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a8..d597e43 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. 
*/ static void bio_pageinc(struct bio *bio) diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index d9d6022..d220914 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma) if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _counts */ + /* drop page _refcounts */ for (pg = 0; pg < msc->nr_pages; pg++) { struct page *page = msc_buffer_get_page(msc, pg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f345679..bd94770 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i))) goto err_unmap; - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_add(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -452,8 +452,8 @@ err_unmap: while (--i >= 0) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq, */ split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->dma_info.page[i]._count); + page_ref_add(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz, PCI_DMA_FROMDEVICE); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->dma_info.page[i]._count); + page_ref_sub(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(&wi->dma_info.page[i]); } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 3aabfc0..73dd525 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev, /* Incr page ref count to reuse on allocation failure * so that it doesn't get freed while freeing SKB. */ - atomic_inc(¤t_bd->data->_count); + page_ref_inc(current_bd->data); goto out; } @@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget) * freeing SKB. 
*/ - atomic_inc(&sw_rx_data->data->_count); + page_ref_inc(sw_rx_data->data); rxq->rx_alloc_errors++; qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num); diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9..3ecd445 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 727f7997..1193a54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -734,7 +734,7 @@ static inline void get_page(struct page *page) page = compound_head(page); /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. + * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2d75b4..1fda9c9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,9 +73,9 @@ struct page { unsigned long counters; #else /* - * Keep _count separate from slub cmpxchg_double data. - * As the rest of the double word is protected by - * slab_lock but _count is not. + * Keep _refcount separate from slub cmpxchg_double + * data. As the rest of the double word is protected by + * slab_lock but _refcount is not. */ unsigned counters; #endif @@ -97,7 +97,11 @@ struct page { }; int units; /* SLOB */ }; - atomic_t _count; /* Usage count, see below. */ + /* + * Usage count, *USE WRAPPER FUNCTION* + * when manual accounting. See page_ref.h + */ + atomic_t _refcount; }; unsigned int active; /* SLAB */ }; @@ -248,7 +252,7 @@ struct page_frag_cache { __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. + * containing page->_refcount every time we allocate a fragment. 
*/ unsigned int pagecnt_bias; bool pfmemalloc; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index e596d5d9..8b5e0a9 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v) static inline int page_ref_count(struct page *page) { - return atomic_read(&page->_count); + return atomic_read(&page->_refcount); } static inline int page_count(struct page *page) { - return atomic_read(&compound_head(page)->_count); + return atomic_read(&compound_head(page)->_refcount); } static inline void set_page_count(struct page *page, int v) { - atomic_set(&page->_count, v); + atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) __page_ref_set(page, v); } @@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { - atomic_add(nr, &page->_count); + atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { - atomic_sub(nr, &page->_count); + atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { - atomic_inc(&page->_count); + atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { - atomic_dec(&page->_count); + atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - int ret = atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); @@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) static inline int page_ref_dec_and_test(struct page *page) { - int ret = atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); @@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page) static inline int page_ref_dec_return(struct page *page) { - int ret = atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); @@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page) static inline int page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) static inline int page_ref_freeze(struct page *page, int count) { - int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) __page_ref_freeze(page, count, ret); @@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) 
VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - atomic_set(&page->_count, count); + atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7e1ab15..fe1513f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. - * If the page is free (_count == 0), then _count is untouched, and 0 - * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * If the page is free (_refcount == 0), then _refcount is untouched, and 0 + * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _count. + * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher @@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold); * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * - * Remove-side that cares about stability of _count (eg. reclaim) has the + * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3e..1c03dfb 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee12..f8ac8f5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/internal.h b/mm/internal.h index b79abb6..098a89e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. 
*/ static inline void set_page_refcounted(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069ef..4ce57f9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) diff --git a/mm/slub.c b/mm/slub.c index 8671de2..cf1faa4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61..d3a02ac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. @@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. */ -- cgit v0.10.2 From d64e85d3e1c59c3664b9ec1183052ec4641ea1e2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:52 -0700 Subject: compiler.h: add support for malloc attribute gcc as far back as at least 3.04 documents the function attribute __malloc__. Add a shorthand for attaching that to a function declaration. This was also suggested by Andi Kleen way back in 2002 [1], but didn't get applied, perhaps because gcc at that time generated the exact same code with and without this attribute. This attribute tells the compiler that the return value (if non-NULL) can be assumed not to alias any other valid pointers at the time of the call. 
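(As a rough sketch of the intended usage, assuming a hypothetical allocator my_zalloc() that is not part of this patch:)

#include <linux/compiler.h>	/* __malloc, added by this patch */
#include <linux/slab.h>		/* kzalloc() */

/*
 * Hypothetical allocator: the returned pointer, when non-NULL, aliases
 * nothing else live at the call site, so __malloc is safe to apply.
 * Zeroed (__GFP_ZERO) contents are fine; a kmemdup()-style wrapper
 * would not be, per the clarified GCC semantics discussed below.
 */
static void *my_zalloc(size_t size, gfp_t gfp) __malloc;

static void *my_zalloc(size_t size, gfp_t gfp)
{
	return kzalloc(size, gfp);
}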
Please note that the documentation for a range of gcc versions (starting from around 4.7) contained a somewhat confusing and self-contradicting text: The malloc attribute is used to tell the compiler that a function may be treated as if any non-NULL pointer it returns cannot alias any other pointer valid when the function returns and *that the memory has undefined content*. [...] Standard functions with this property include malloc and *calloc*. (emphasis mine). The intended meaning has later been clarified [2]: This tells the compiler that a function is malloc-like, i.e., that the pointer P returned by the function cannot alias any other pointer valid when the function returns, and moreover no pointers to valid objects occur in any storage addressed by P. What this means is that we can apply the attribute to kmalloc and friends, and it is ok for the returned memory to have well-defined contents (__GFP_ZERO). But it is not ok to apply it to kmemdup(), nor to other functions which both allocate and possibly initialize the memory with existing pointers. So unless someone is doing something pretty perverted kstrdup() should also be a fine candidate. [1] http://thread.gmane.org/gmane.linux.kernel/57172 [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56955 Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 3d5202e..e294939 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -142,6 +142,7 @@ #if GCC_VERSION >= 30400 #define __must_check __attribute__((warn_unused_result)) +#define __malloc __attribute__((__malloc__)) #endif #if GCC_VERSION >= 40000 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b5ff988..793c082 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __deprecated_for_modules #endif +#ifndef __malloc +#define __malloc +#endif + /* * Allow us to avoid 'defined but not used' warnings on functions and data, * as well as force them to be emitted to the assembly file. -- cgit v0.10.2 From 48a270554a3251681ae11173f2fd6389d943e183 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 May 2016 17:10:55 -0700 Subject: include/linux: apply __malloc attribute Attach the malloc attribute to a few allocation functions. This helps gcc generate better code by telling it that the return value doesn't alias any existing pointers (which is even more valuable given the pessimizations implied by -fno-strict-aliasing). A simple example of what this allows gcc to do can be seen by looking at the last part of drm_atomic_helper_plane_reset: plane->state = kzalloc(sizeof(*plane->state), GFP_KERNEL); if (plane->state) { plane->state->plane = plane; plane->state->rotation = BIT(DRM_ROTATE_0); } which compiles to e8 99 bf d6 ff callq ffffffff8116d540 48 85 c0 test %rax,%rax 48 89 83 40 02 00 00 mov %rax,0x240(%rbx) 74 11 je ffffffff814015c4 48 89 18 mov %rbx,(%rax) 48 8b 83 40 02 00 00 mov 0x240(%rbx),%rax [*] c7 40 40 01 00 00 00 movl $0x1,0x40(%rax) With this patch applied, the instruction at [*] is elided, since the store to plane->state->plane is known to not alter the value of plane->state. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rasmus Villemoes Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 35b22f9..f9be326 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size, unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit) __malloc; extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; void *__alloc_bootmem_low_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal) __malloc; #ifdef CONFIG_NO_BOOTMEM /* We are using top down, so it is safe to use 0 here */ diff --git a/include/linux/device.h b/include/linux/device.h index b130304..ca90ad8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); #ifdef CONFIG_DEBUG_DEVRES extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid, const char *name); + int nid, const char *name) __malloc; #define devres_alloc(release, size, gfp) \ __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) #define devres_alloc_node(release, size, gfp, nid) \ __devres_alloc_node(release, size, gfp, nid, #release) #else extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid); + int nid) __malloc; static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp) { return devres_alloc_node(release, size, gfp, NUMA_NO_NODE); @@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id); extern int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ -extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp); +extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; extern __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, - va_list ap); + va_list ap) __malloc; extern __printf(3, 4) -char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...); +char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) 
__malloc; static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) { return devm_kmalloc(dev, size, gfp | __GFP_ZERO); @@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev, return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); } extern void devm_kfree(struct device *dev, void *p); -extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp); +extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; extern void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f7775e..cc73982 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -412,9 +412,9 @@ extern __printf(3, 4) int scnprintf(char *buf, size_t size, const char *fmt, ...); extern __printf(3, 0) int vscnprintf(char *buf, size_t size, const char *fmt, va_list args); -extern __printf(2, 3) +extern __printf(2, 3) __malloc char *kasprintf(gfp_t gfp, const char *fmt, ...); -extern __printf(2, 0) +extern __printf(2, 0) __malloc char *kvasprintf(gfp_t gfp, const char *fmt, va_list args); extern __printf(2, 0) const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 69b6951..b1086c9 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -5,6 +5,7 @@ #define _LINUX_MEMPOOL_H #include +#include struct kmem_cache; @@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); -extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); +extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* diff --git a/include/linux/slab.h b/include/linux/slab.h index 508bd82..aeb3e6d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size) } #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *, void *); /* @@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment; +extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment; + int 
node, size_t size) __assume_slab_alignment __malloc; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, @@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) diff --git a/include/linux/string.h b/include/linux/string.h index d3993a7..26b6f6a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new); extern void kfree_const(const void *x); -extern char *kstrdup(const char *s, gfp_t gfp); +extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); -- cgit v0.10.2 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. [mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index a992238..153020a 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void) cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]); cpu_2_node[best_cpu] = node; cpumask_clear_cpu(best_cpu, &unbound_cpus); - node = next_node(node, default_nodes); - if (node == MAX_NUMNODES) - node = first_node(default_nodes); + node = next_node_in(node, default_nodes); } /* Print out node assignments and set defaults for disabled cpus */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff..9c086c5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 6e85889..f746e44 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -43,8 +43,10 @@ * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or - * MAX_NUMNODES. 
+ * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set @@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp) return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +/* + * Find the next present node in src, starting after node n, wrapping around to + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. + */ +#define next_node_in(n, src) __next_node_in((n), &(src)) +int __next_node_in(int node, const nodemask_t *srcp); + static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956..611cc69 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/lib/Makefile b/lib/Makefile index 931396a..42b6918 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 0000000..e42a5bf4 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include +#include +#include + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08..5856093 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -937,9 +937,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5..6740c4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. 
* No LRU may hold pages because all pages are UNEVICTABLE or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01b..8d369ce 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include #include -#include #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f5682..67bedd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. */ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/slab.c b/mm/slab.c index d81565a..c11bf50 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } -- cgit v0.10.2 From 09a95e29cb30a3930db22d340ddd072a82b6b0db Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 19 May 2016 17:11:01 -0700 Subject: mm/hugetlb: optimize minimum size (min_size) accounting It was observed that minimum size accounting associated with the hugetlbfs min_size mount option may not perform optimally and as expected. 
As huge pages/reservations are released from the filesystem and given back to the global pools, they are reserved for subsequent filesystem use as long as the subpool reserved count is less than subpool minimum size. It does not take into account used pages within the filesystem. The filesystem size limits are not exceeded and this is technically not a bug. However, better behavior would be to wait for the number of used pages/reservations associated with the filesystem to drop below the minimum size before taking reservations to satisfy minimum size. An optimization is also made to the hugepage_subpool_get_pages() routine which is called when pages/reservations are allocated. This does not change behavior, but simply avoids the accounting if all reservations have already been taken (subpool reserved count == 0). Signed-off-by: Mike Kravetz Acked-by: Naoya Horiguchi Cc: Hillf Danton Cc: David Rientjes Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5856093..fb37ef8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -144,7 +144,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } } - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on @@ -182,7 +183,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else -- cgit v0.10.2 From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:04 -0700 Subject: mm/hugetlb: introduce hugetlb_bad_size() When any unsupported hugepage size is specified, 'hugepagesz=' and 'hugepages=' should be ignored during command line parsing until any supported hugepage size is found. But currently incorrect number of hugepages are allocated when unsupported size is specified as it fails to ignore the 'hugepages=' command. Test case: Note that this is specific to x86 architecture. Boot the kernel with command line option 'hugepagesz=256M hugepages=X'. After boot, dmesg output shows that X number of hugepages of the size 2M is pre-allocated instead of 0. So, to handle such command line options, introduce new routine hugetlb_bad_size. The routine hugetlb_bad_size sets the global variable parsed_valid_hugepagesz. We are using parsed_valid_hugepagesz to save the state when unsupported hugepagesize is found so that we can ignore the 'hugepages=' parameters after that and then reset the variable when supported hugepage size is found. The routine hugetlb_bad_size can be called while setting 'hugepagesz=' parameter in an architecture specific code. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d953c2..e44c578 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); +void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); struct hstate *size_to_hstate(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb37ef8..0adb74d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; -- cgit v0.10.2 From d77e20cea7183145acff928f64b9cba4c825fd83 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:08 -0700 Subject: arm64: mm: use hugetlb_bad_size() Update setup_hugepagesz() to call hugetlb_bad_size() when unsupported hugepage size is found. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 589fd28..aa8aee7 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -307,6 +307,7 @@ static __init int setup_hugepagesz(char *opt) } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { + hugetlb_bad_size(); pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } -- cgit v0.10.2 From 9cc3387fa29fe7827ea6249a1e138ba2a5965082 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:11 -0700 Subject: metag: mm: use hugetlb_bad_size() Update setup_hugepagesz() to call hugetlb_bad_size() when unsupported hugepage size is found. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. 
Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index b38700ae..db1b7da 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -239,6 +239,7 @@ static __init int setup_hugepagesz(char *opt) if (ps == (1 << HPAGE_SHIFT)) { hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT); } else { + hugetlb_bad_size(); pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); return 0; -- cgit v0.10.2 From 71bf79cc3ff8e7be3e61399c6a9787d0c57c5cd1 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:14 -0700 Subject: powerpc: mm: use hugetlb_bad_size() Update setup_hugepagesz() to call hugetlb_bad_size() when unsupported hugepage size is found. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d991b9e..a4a90a8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -772,8 +772,10 @@ static int __init hugepage_setup_sz(char *str) size = memparse(str, &str); - if (add_huge_page_size(size) != 0) - printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + if (add_huge_page_size(size) != 0) { + hugetlb_bad_size(); + pr_err("Invalid huge page size specified(%llu)\n", size); + } return 1; } -- cgit v0.10.2 From b3d424f1a534949bce27847aa871b3ce75c60f4d Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:17 -0700 Subject: tile: mm: use hugetlb_bad_size() Update setup_hugepagesz() to call hugetlb_bad_size() when unsupported hugepage size is found. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index e212c64..77ceaa3 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -308,11 +308,16 @@ static bool saw_hugepagesz; static __init int setup_hugepagesz(char *opt) { + int rc; + if (!saw_hugepagesz) { saw_hugepagesz = true; memset(huge_shift, 0, sizeof(huge_shift)); } - return __setup_hugepagesz(memparse(opt, NULL)); + rc = __setup_hugepagesz(memparse(opt, NULL)); + if (rc) + hugetlb_bad_size(); + return rc; } __setup("hugepagesz=", setup_hugepagesz); -- cgit v0.10.2 From 2b18e5321f512aa85fd91db835a4c0dcdb2cd063 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:20 -0700 Subject: x86: mm: use hugetlb_bad_size() Update setup_hugepagesz() to call hugetlb_bad_size() when unsupported hugepage size is found. Signed-off-by: Vaishali Thakkar Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 14a9505..2ae8584 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -165,6 +165,7 @@ static __init int setup_hugepagesz(char *opt) } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { + hugetlb_bad_size(); printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", ps >> 20); return 0; -- cgit v0.10.2 From 32f6271dbdc351dce96b11c5f3567bae8188004f Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:23 -0700 Subject: mm/hugetlb: is_vm_hugetlb_page() can return bool Make is_vm_hugetlb_page() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 2bb681f..a4e7ca0 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -5,16 +5,16 @@ #include -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else -static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return 0; + return false; } #endif -- cgit v0.10.2 From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:26 -0700 Subject: mm/memory_hotplug: is_mem_section_removable() can return bool Make is_mem_section_removable() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index adbef58..20d8a5d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); +extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern void remove_memory(int nid, u64 start, u64 size); #else -static inline int is_mem_section_removable(unsigned long pfn, +static inline bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages) { - return 0; + return false; } static inline void try_offline_node(int nid) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431..b21d889 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. 
*/ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1418,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* -- cgit v0.10.2 From bb00a789e565b96c52b2224c2280f7ac83175bec Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:29 -0700 Subject: mm/vmalloc.c: is_vmalloc_addr() can return bool Make is_vmalloc_addr() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 1193a54..5b37513 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ -static inline int is_vmalloc_addr(const void *x) +static inline bool is_vmalloc_addr(const void *x) { #ifdef CONFIG_MMU unsigned long addr = (unsigned long)x; return addr >= VMALLOC_START && addr < VMALLOC_END; #else - return 0; + return false; #endif } #ifdef CONFIG_MMU -- cgit v0.10.2 From 4ee815be1d34a6f254b3d09bdebcb27f294f2bd3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:32 -0700 Subject: mm/mempolicy.c: vma_migratable() can return bool Make vma_migratable() return bool due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 2696c1f..6978a99 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol); extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ -static inline int vma_migratable(struct vm_area_struct *vma) +static inline bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - return 0; + return false; #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if (vma->vm_flags & VM_HUGETLB) - return 0; + return false; #endif /* @@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) - return 0; - return 1; + return false; + return true; } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); -- cgit v0.10.2 From fda3d69be9fe7a24ad32b840cb2ed7c30b6ba1c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:11:34 -0700 Subject: mm/memcontrol.c:mem_cgroup_select_victim_node(): clarify comment > The comment seems to have not much to do with the code? I guess the comment tries to say that the code path is triggered when we charge the page which happens _before_ it is added to the LRU list and so last_scanned_node might contain the stale data. 
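As an illustration of the round-robin pattern that next_node_in() enables in the hunk below (a simplified sketch, not the kernel code itself; pick_next_node() is a hypothetical name):

static int pick_next_node(int prev, nodemask_t mask)
{
	/*
	 * next_node_in() advances past 'prev' within 'mask', wrapping
	 * around to the first set bit. It returns MAX_NUMNODES only
	 * when the mask is empty, so fall back to the local node then.
	 */
	int node = next_node_in(prev, mask);

	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();
	return node;
}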
Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6740c4c..011dac8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1391,10 +1391,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) node = next_node_in(node, memcg->scan_nodes); /* - * We call this when we hit limit, not when pages are added to LRU. - * No LRU may hold pages because all pages are UNEVICTABLE or - * memcg is too small and all pages are not on LRU. In that case, - * we use curret node. + * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages + * last time it really checked all the LRUs due to rate limiting. + * Fallback to the current node in that case for simplicity. */ if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); -- cgit v0.10.2 From 949698a31af3b3808b0ff0cca26f36e68953bd1f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 19 May 2016 17:11:37 -0700 Subject: mm/page_alloc: Remove useless parameter of __free_pages_boot_core __free_pages_boot_core has parameter pfn which is not used at all. Remove it. Signed-off-by: Li Zhang Reviewed-by: Pan Xinhui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ce57f9..34f688b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1076,8 +1076,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1154,7 +1153,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, pfn, order); + return __free_pages_boot_core(page, order); } /* @@ -1239,12 +1238,12 @@ static void __init deferred_free_range(struct page *page, if (nr_pages == MAX_ORDER_NR_PAGES && (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pfn, MAX_ORDER-1); + __free_pages_boot_core(page, MAX_ORDER-1); return; } - for (i = 0; i < nr_pages; i++, page++, pfn++) - __free_pages_boot_core(page, pfn, 0); + for (i = 0; i < nr_pages; i++, page++) + __free_pages_boot_core(page, 0); } /* Completion tracking for deferred_init_memmap() threads */ -- cgit v0.10.2 From 54f18d35263334ebcc6bf409fee3c0c8c22e5588 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:11:40 -0700 Subject: mm/hugetlb.c: use first_memory_node Instead of open-coding it. 
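For reference, first_memory_node is (as far as I can tell) just a one-line wrapper in include/linux/nodemask.h, so the hunk below is a pure readability change:

/* Paraphrased from include/linux/nodemask.h; the single-node
 * (MAX_NUMNODES == 1) configuration simply defines this as 0. */
#define first_memory_node	first_node(node_states[N_MEMORY])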
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0adb74d..0f580ea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2684,8 +2684,8 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_MEMORY]); + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); -- cgit v0.10.2 From fee83b3aba4b7ddb0cb1497a04ddebcaa43f236e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:11:43 -0700 Subject: mm/mempolicy.c:offset_il_node() document and clarify This code was pretty obscure and was relying upon obscure side-effects of next_node(-1, ...) and was relying upon NUMA_NO_NODE being equal to -1. Clean that all up and document the function's intent. Acked-by: Vlastimil Babka Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8d369ce..7f80ebc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1758,23 +1758,25 @@ unsigned int mempolicy_slab_node(void) } } -/* Do static interleaving for a VMA with known offset. */ +/* + * Do static interleaving for a VMA with known offset @n. Returns the n'th + * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the + * number of present nodes. + */ static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) + struct vm_area_struct *vma, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; - int c; - int nid = NUMA_NO_NODE; + int i; + int nid; if (!nnodes) return numa_node_id(); - target = (unsigned int)off % nnodes; - c = 0; - do { + target = (unsigned int)n % nnodes; + nid = first_node(pol->v.nodes); + for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); - c++; - } while (c <= target); return nid; } -- cgit v0.10.2 From e4c5800a3991f0c6a766983535dfc10d51802cf6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 May 2016 17:11:46 -0700 Subject: mm/rmap: replace BUG_ON(anon_vma->degree) with VM_WARN_ON This check effectively catches anon vma hierarchy inconsistence and some vma corruptions. It was effective for catching corner cases in anon vma reusing logic. For now this code seems stable so check could be hidden under CONFIG_DEBUG_VM and replaced with WARN because it's not so fatal. 
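The practical difference between the two macros, roughly sketched (recalled from this kernel era's include/linux/mmdebug.h and include/asm-generic/bug.h; details may vary):

/* BUG_ON(cond): always compiled in; kills the task if cond is true. */

/* VM_WARN_ON(cond), approximately: */
#ifdef CONFIG_DEBUG_VM
#define VM_WARN_ON(cond)	WARN_ON(cond)	/* stack trace, then continue */
#else
#define VM_WARN_ON(cond)	BUILD_BUG_ON_INVALID(cond) /* type-check only */
#endif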
Signed-off-by: Konstantin Khlebnikov Suggested-by: Vasily Averin Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/rmap.c b/mm/rmap.c index 307b555..4cebe8a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - BUG_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->degree); put_anon_vma(anon_vma); list_del(&avc->same_vma); -- cgit v0.10.2 From 06b6640a3902d6d50c1bb4fb1f29a46b207dbf08 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:48 -0700 Subject: mm, compaction: wrap calculating first and last pfn of pageblock Compaction code has accumulated numerous instances of manual calculations of the first (inclusive) and last (exclusive) pfn of a pageblock (or a smaller block of given order), given a pfn within the pageblock. Wrap these calculations by introducing pageblock_start_pfn(pfn) and pageblock_end_pfn(pfn) macros. [vbabka@suse.cz: fix crash in get_pfnblock_flags_mask() from isolate_freepages():] Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 8fa2540..017a1a1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) +#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) +#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone) zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = - round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); + pageblock_start_pfn(zone_end_pfn(zone) - 1); } /* @@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc, LIST_HEAD(freelist); pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn += isolated, block_start_pfn = block_end_pfn, @@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); } @@ -834,10 +839,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, /* Scan block by block. 
First and last block may be incomplete */ pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn = block_end_pfn, block_start_pfn = block_end_pfn, @@ -924,10 +929,10 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = pageblock_start_pfn(cc->free_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); - low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + low_pfn = pageblock_end_pfn(cc->migrate_pfn); /* * Isolate free pages until enough are available to migrate the @@ -1081,12 +1086,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; - block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(low_pfn); /* * Iterate over whole pageblocks until we find the first suitable. @@ -1343,7 +1348,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { @@ -1411,7 +1416,7 @@ check_drain: if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = - cc->migrate_pfn & ~((1UL << cc->order) - 1); + block_start_pfn(cc->migrate_pfn, cc->order); if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); @@ -1436,7 +1441,7 @@ out: cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ - free_pfn &= ~(pageblock_nr_pages-1); + free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() -- cgit v0.10.2 From a34753d275576896b06af9baa6f54bee258368c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:51 -0700 Subject: mm, compaction: reduce spurious pcplist drains Compaction drains the local pcplists each time migration scanner moves away from a cc->order aligned block where it isolated pages for migration, so that the pages freed by migrations can merge into higher orders. The detection is currently coarser than it could be. The cc->last_migrated_pfn variable should track the lowest pfn that was isolated for migration. But it is set to the pfn where isolate_migratepages_block() starts scanning, which is typically the first pfn of the pageblock. There, the scanner might fail to isolate several order-aligned blocks, and then isolate COMPACT_CLUSTER_MAX in another block. This would cause the pcplists drain to be performed, although the scanner didn't yet finish the block where it isolated from. 
This patch thus makes cc->last_migrated_pfn handling more accurate by setting it to the pfn of an actually isolated page in isolate_migratepages_block(). Although practical effects of this patch are likely low, it arguably makes the intent of the code more obvious. Also the next patch will make async direct compaction skip blocks more aggressively, and draining pcplists due to skipped blocks is wasteful. Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 017a1a1..329973a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,6 +787,15 @@ isolate_success: cc->nr_migratepages++; nr_isolated++; + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that was isolated and likely be + * then freed by migration. + */ + if (!cc->last_migrated_pfn) + cc->last_migrated_pfn = low_pfn; + /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -1075,7 +1084,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long block_start_pfn; unsigned long block_end_pfn; unsigned long low_pfn; - unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1130,7 +1138,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ - isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); @@ -1140,15 +1147,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (cc->nr_migratepages && !cc->last_migrated_pfn) - cc->last_migrated_pfn = isolate_start_pfn; - - /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. -- cgit v0.10.2 From fdd048e12c9a46d058f69822cb15641adae181e1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:55 -0700 Subject: mm, compaction: skip blocks where isolation fails in async direct compaction The goal of direct compaction is to quickly make a high-order page available for the pending allocation. Within an aligned block of pages of desired order, a single allocated page that cannot be isolated for migration means that the block cannot fully merge to a buddy page that would satisfy the allocation request. Therefore we can reduce the allocation stall by skipping the rest of the block immediately on isolation failure. For async compaction, this also means a higher chance of succeeding until it detects contention. We however shouldn't completely sacrifice the second objective of compaction, which is to reduce overal long-term memory fragmentation. As a compromise, perform the eager skipping only in direct async compaction, while sync compaction (including kcompactd) remains thorough. 
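The policy condenses into a short sketch (simplified; 'isolation_failed' is a stand-in for the various isolate_fail paths, and the full logic, including the putback of already-isolated pages, is in the isolate_migratepages_block() hunks below):

	bool skip_on_failure = cc->direct_compaction &&
			       cc->mode == MIGRATE_ASYNC;

	if (skip_on_failure && isolation_failed) {
		/* abandon partial work in this order-aligned block */
		putback_movable_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
		/* jump to the next block; the loop increment adds one */
		low_pfn = block_end_pfn(low_pfn, cc->order) - 1;
	}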
Testing was done using stress-highalloc from mmtests, configured for order-4 GFP_KERNEL allocations: 4.6-rc1 4.6-rc1 before after Success 1 Min 24.00 ( 0.00%) 27.00 (-12.50%) Success 1 Mean 30.20 ( 0.00%) 31.60 ( -4.64%) Success 1 Max 37.00 ( 0.00%) 35.00 ( 5.41%) Success 2 Min 42.00 ( 0.00%) 32.00 ( 23.81%) Success 2 Mean 44.00 ( 0.00%) 44.80 ( -1.82%) Success 2 Max 48.00 ( 0.00%) 52.00 ( -8.33%) Success 3 Min 91.00 ( 0.00%) 92.00 ( -1.10%) Success 3 Mean 92.20 ( 0.00%) 92.80 ( -0.65%) Success 3 Max 94.00 ( 0.00%) 93.00 ( 1.06%) We can see that success rates are unaffected by the skipping. 4.6-rc1 4.6-rc1 before after User 2587.42 2566.53 System 482.89 471.20 Elapsed 1395.68 1382.00 Times are not so useful metric for this benchmark as main portion is the interfering kernel builds, but results do hint at reduced system times. 4.6-rc1 4.6-rc1 before after Direct pages scanned 163614 159608 Kswapd pages scanned 2070139 2078790 Kswapd pages reclaimed 2061707 2069757 Direct pages reclaimed 163354 159505 Reduced direct reclaim was unintended, but could be explained by more successful first attempt at (async) direct compaction, which is attempted before the first reclaim attempt in __alloc_pages_slowpath(). Compaction stalls 33052 39853 Compaction success 12121 19773 Compaction failures 20931 20079 Compaction is indeed more successful, and thus less likely to get deferred, so there are also more direct compaction stalls. Page migrate success 3781876 3326819 Page migrate failure 45817 41774 Compaction pages isolated 7868232 6941457 Compaction migrate scanned 168160492 127269354 Compaction migrate prescanned 0 0 Compaction free scanned 2522142582 2326342620 Compaction free direct alloc 0 0 Compaction free dir. all. miss 0 0 Compaction cost 5252 4476 The patch reduces migration scanned pages by 25% thanks to the eager skipping. [hughd@google.com: prevent nr_isolated_* from going negative] Signed-off-by: Vlastimil Babka Signed-off-by: Hugh Dickins Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 329973a..7487067 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -638,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; - struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; /* * Ensure that there are not too many pages isolated from the LRU @@ -664,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (compact_should_abort(cc)) return 0; + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { bool is_lru; + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. 
Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -679,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, break; if (!pfn_valid_within(low_pfn)) - continue; + goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -734,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (likely(comp_order < MAX_ORDER)) low_pfn += (1UL << comp_order) - 1; - continue; + goto isolate_fail; } if (!is_lru) - continue; + goto isolate_fail; /* * Migration will fail if an anonymous page is pinned in memory, @@ -747,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!page_mapping(page) && page_count(page) > page_mapcount(page)) - continue; + goto isolate_fail; /* If we already hold the lock, we can skip some rechecking */ if (!locked) { @@ -758,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) - continue; + goto isolate_fail; /* * Page become compound since the non-locked check, @@ -767,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page))) { low_pfn += (1UL << compound_order(page)) - 1; - continue; + goto isolate_fail; } } @@ -775,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) - continue; + goto isolate_fail; VM_BUG_ON_PAGE(PageCompound(page), page); @@ -783,7 +811,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - list_add(&page->lru, migratelist); + list_add(&page->lru, &cc->migratepages); cc->nr_migratepages++; nr_isolated++; @@ -801,6 +829,37 @@ isolate_success: ++low_pfn; break; } + + continue; +isolate_fail: + if (!skip_on_failure) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } + acct_isolated(zone, cc); + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + cc->last_migrated_pfn = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } } /* @@ -1401,6 +1460,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = COMPACT_CONTENDED; goto out; } + /* + * We failed to migrate at least one page in the current + * order-aligned block, so skip the rest of it. 
+ */ + if (cc->direct_compaction && + (cc->mode == MIGRATE_ASYNC)) { + cc->migrate_pfn = block_end_pfn( + cc->migrate_pfn - 1, cc->order); + /* Draining pcplists is useless in this case */ + cc->last_migrated_pfn = 0; + + } } check_drain: -- cgit v0.10.2 From 29f9cb53d25cd9916537b44b0af7f0b95a2e4438 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Thu, 19 May 2016 17:11:57 -0700 Subject: mm/highmem: simplify is_highmem() is_highmem() can be simplified by use of is_highmem_idx(). This patch removes redundant code and will make it easier to maintain if the zone policy is changed or a new zone is added. (akpm: saves me 25 bytes of text per is_highmem() callsite) Signed-off-by: Chanho Min Reviewed-by: Dan Williams Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60df92..150c604 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -828,10 +828,7 @@ static inline int is_highmem_idx(enum zone_type idx) static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM - int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; - return zone_off == ZONE_HIGHMEM * sizeof(*zone) || - (zone_off == ZONE_MOVABLE * sizeof(*zone) && - zone_movable_is_highmem()); + return is_highmem_idx(zone_idx(zone)); #else return 0; #endif -- cgit v0.10.2 From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:12:00 -0700 Subject: mm: uninline page_mapped() It's huge. Uninlining it saves 206 bytes per callsite. Shaves 4924 bytes from the x86_64 allmodconfig vmlinux. [akpm@linux-foundation.org: coding-style fixes] Cc: Steve Capper Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b37513..9c2852c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page) return page->index; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. - */ -static inline bool page_mapped(struct page *page) -{ - int i; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) - return true; - if (PageHuge(page)) - return false; - for (i = 0; i < hpage_nr_pages(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) - return true; - } - return false; -} +bool page_mapped(struct page *page); /* * Return true only if the page has been allocated with diff --git a/mm/util.c b/mm/util.c index 6cc81e7..8a1b3a1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -346,6 +346,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. 
+ */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; -- cgit v0.10.2 From f44b2dda8bc29de36ccdc1e04092de7d0b2d5868 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:03 -0700 Subject: mm/hugetlb: add same zone check in pfn_range_valid_gigantic() This patchset deals with some problematic sites that iterate pfn ranges. There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to take care of this overlapping when iterating pfn range. I audit many iterating sites that uses pfn_valid(), pfn_valid_within(), zone_start_pfn and etc. and others looks safe to me. This is a preparation step for a new CMA implementation, ZONE_CMA (https://lkml.org/lkml/2015/2/12/95), because it would be easily overlapped with other zones. But, zone overlap check is also needed for the general case so I send it separately. This patch (of 5): alloc_gigantic_page() uses alloc_contig_range() and this requires that the requested range is in a single zone. To satisfy this requirement, add this check to pfn_range_valid_gigantic(). Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f580ea..949d806 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1031,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); } -static bool pfn_range_valid_gigantic(unsigned long start_pfn, - unsigned long nr_pages) +static bool pfn_range_valid_gigantic(struct zone *z, + unsigned long start_pfn, unsigned long nr_pages) { unsigned long i, end_pfn = start_pfn + nr_pages; struct page *page; @@ -1043,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn, page = pfn_to_page(i); + if (page_zone(page) != z) + return false; + if (PageReserved(page)) return false; @@ -1075,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) pfn = ALIGN(z->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(pfn, nr_pages)) { + if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone -- cgit v0.10.2 From b9eb63191a6ad70127ebdd83650b810cdc1d1117 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:06 -0700 Subject: mm/memory_hotplug: add comment to some functions related to memory hotplug __offline_isolated_pages() and test_pages_isolated() are used by memory hotplug. These functions require that range is in a single zone but there is no code to do this because memory hotplug checks it before calling these functions. To avoid confusing future user of these functions, this patch adds comments to them. 
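The pattern that this and the following overlap fixes converge on can be shown compactly (an illustrative sketch using the standard pfn helpers):

	/*
	 * Walk a zone's pfn span safely on systems where node pfn ranges
	 * interleave (N0 N1 N2 N0 N1 N2): a pfn inside the span may belong
	 * to a different zone, so verify ownership with page_zone().
	 */
	for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone); pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (page_zone(page) != zone)	/* overlapping foreign zone */
			continue;
		/* ... 'page' really belongs to 'zone' here ... */
	}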
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34f688b..19fe7e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7176,7 +7176,8 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be isolated before calling this. + * All pages in the range must be in a single zone and isolated + * before calling this. */ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 67bedd1..612122b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } +/* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { -- cgit v0.10.2 From a91c43c7313a995a8908f8f6b911a85d00fdbffd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:10 -0700 Subject: mm/vmstat: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. There are two places in vmstat.c that iterates pfn range and they don't consider this overlapping. Add it. Without this patch, above system could over count pageblock number on a zone. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmstat.c b/mm/vmstat.c index 5e43004..0a726e3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1010,6 +1010,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!memmap_valid_within(pfn, page, zone)) continue; + if (page_zone(page) != zone) + continue; + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -1076,6 +1079,10 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (PageBuddy(page)) { pfn += (1UL << page_order(page)) - 1; continue; -- cgit v0.10.2 From 9d43f5aec9506d98ad492a783aa8a18226c5d474 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:13 -0700 Subject: mm/page_owner: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. There are one place in page_owner.c that iterates pfn range and it doesn't consider this overlapping. Add it. Without this patch, above system could over count early allocated page number before page_owner is activated. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. 
Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_owner.c b/mm/page_owner.c index ac3d8d1..438768c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + /* * We are safe to check buddy flag and order, because * this is init stage and only single thread runs. -- cgit v0.10.2 From ba6b0979e346fd91d3b7ef6956d7155877308f0f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:16 -0700 Subject: power: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. mark_free_pages() iterates requested zone's pfn range and unset all range's bitmap first. And then it marks freepages in a zone to the bitmap. If there is an overlapping zone, above unset could clear previous marked bit and reference to this bitmap in the future will cause the problem. To prevent it, this patch adds a zone check in mark_free_pages(). Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19fe7e9..e132705 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2156,6 +2156,10 @@ void mark_free_pages(struct zone *zone) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } -- cgit v0.10.2 From 09b4ab3c433733127f8644290e94548a4cb8122f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:20 -0700 Subject: mm/writeback: correct dirty page calculation for highmem ZONE_MOVABLE could be treated as highmem so we need to consider it for accurate calculation of dirty pages. And, in following patches, ZONE_CMA will be introduced and it can be treated as highmem, too. So, instead of manually adding stat of ZONE_MOVABLE, looping all zones and check whether the zone is highmem or not and add stat of the zone which can be treated as highmem. 
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bc5149d..3b88795 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; + int i; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &NODE_DATA(node)->node_zones[i]; - x += zone_dirtyable_memory(z); + if (is_highmem(z)) + x += zone_dirtyable_memory(z); + } } /* * Unreclaimable memory (kernel memory or anonymous memory -- cgit v0.10.2 From fc2bd799c7c79c84a59da6f9221370bc6f38c503 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:23 -0700 Subject: mm/page_alloc: correct highmem memory statistics ZONE_MOVABLE could be treated as highmem so we need to consider it for accurate statistics. And, in following patches, ZONE_CMA will be introduced and it can be treated as highmem, too. So, instead of manually adding stat of ZONE_MOVABLE, looping all zones and check whether the zone is highmem or not and add stat of the zone which can be treated as highmem. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e132705..da6d339 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3793,6 +3793,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) @@ -3801,12 +3803,19 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; - val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone->managed_pages; + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #else - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } -- cgit v0.10.2 From 33499bfe507c844f2c6f55ae8cec17705d0eda95 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:26 -0700 Subject: mm/highmem: make nr_free_highpages() handles all highmem zones by itself nr_free_highpages() manually adds statistics per each highmem zone and returns a total value for them. Whenever we add a new highmem zone, we need to consider this function and it's really troublesome. Make it handle all highmem zones by itself. 
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/highmem.c b/mm/highmem.c index 123bcd3..50b4ca6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); unsigned int nr_free_highpages (void) { - pg_data_t *pgdat; + struct zone *zone; unsigned int pages = 0; - for_each_online_pgdat(pgdat) { - pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); - if (zone_movable_is_highmem()) - pages += zone_page_state( - &pgdat->node_zones[ZONE_MOVABLE], - NR_FREE_PAGES); + for_each_populated_zone(zone) { + if (is_highmem(zone)) + pages += zone_page_state(zone, NR_FREE_PAGES); } return pages; -- cgit v0.10.2 From e87d59f7a2002aa2d4582d4ea16da91dd3c72752 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:29 -0700 Subject: mm/vmstat: make node_page_state() handle all zones by itself node_page_state() manually adds statistics for each zone and returns the total for all zones. Whenever we add a new zone, we have to remember to update this function, which is really troublesome. Make it handle all zones by itself. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmstat.c b/mm/vmstat.c index 0a726e3..a7de9ad 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -600,19 +600,13 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) unsigned long node_page_state(int node, enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_page_state(zones + i, item); + + return count; } #endif -- cgit v0.10.2 From 1269019e69a6798db15edea8921f83215ef954d6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 May 2016 17:12:32 -0700 Subject: mm/mmap: kill hook arch_rebalance_pgtables() Nobody uses it. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index bd2e1a53..fba246b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -55,10 +55,6 @@ #define arch_mmap_check(addr, len, flags) (0) #endif -#ifndef arch_rebalance_pgtables -#define arch_rebalance_pgtables(addr, len) (addr) -#endif - #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; @@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (offset_in_page(addr)) return -EINVAL; - addr = arch_rebalance_pgtables(addr, len); error = security_mmap_addr(addr); return error ? 
error : addr; } -- cgit v0.10.2 From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:35 -0700 Subject: mm: update_lru_size warn and reset bad lru_size Though debug kernels have a VM_BUG_ON to help protect from misaccounting lru_size, non-debug kernels are liable to wrap it around: and then the vast unsigned long size draws page reclaim into a loop of repeatedly doing nothing on an empty list, without even a cond_resched(). That soft lockup looks confusingly like an over-busy reclaim scenario, with lots of contention on the lru_lock in shrink_inactive_list(): yet has a totally different origin. Help differentiate with a custom warning in mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the size to avoid the lockup. But the particular bug which suggested this change was mine alone, and since fixed. Make it a WARN_ONCE: the first occurrence is the most informative, a flurry may follow, yet even when rate-limited little more is learnt. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 712e8c3..d8cea81 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_del(&page->lru); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011dac8..6a01997 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1023,22 +1023,38 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. + * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? 
"" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -- cgit v0.10.2 From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:38 -0700 Subject: mm: update_lru_size do the __mod_zone_page_state Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy reclaim was removed) that lru_size can be updated by -nr_taken once per call to isolate_lru_pages(), instead of page by page. Update it inside isolate_lru_pages(), or at its two callsites? I chose to update it at the callsites, rearranging and grouping the updates by nr_taken and nr_scanned together in both. With one exception, mem_cgroup_update_lru_size(,lru,) is then used where __mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding some more calls in a future commit. Make the code a little smaller and simpler by incorporating stat update in lru_size update. The exception was move_active_pages_to_lru(), which aggregated the pgmoved stat update separately from the individual lru_size updates; but I still think this a simplification worth making. However, the __mod_zone_page_state is not peculiar to mem_cgroups: so better use the name update_lru_size, calls mem_cgroup_update_lru_size when CONFIG_MEMCG. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1191d79..94da967 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return 0; } -static inline void -mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int increment) -{ -} - static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d8cea81..5bd29ba 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } +static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ + __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); +#else + __update_lru_size(lruvec, lru, nr_pages); +#endif +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); list_del(&page->lru); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); + update_lru_size(lruvec, lru, -hpage_nr_pages(page)); } /** diff 
--git a/mm/memcontrol.c b/mm/memcontrol.c index 6a01997..1b40dca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, long size; bool empty; + __update_lru_size(lruvec, lru, nr_pages); + if (mem_cgroup_disabled()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index d3a02ac..dcfdfc1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { -- cgit v0.10.2 From fa9949da59a15017a02c86b087c7499d7b5702be Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:41 -0700 Subject: mm: use __SetPageSwapBacked and dont ClearPageSwapBacked v3.16 commit 07a427884348 ("mm: shmem: avoid atomic operation during shmem_getpage_gfp") rightly replaced one instance of SetPageSwapBacked by __SetPageSwapBacked, pointing out that the newly allocated page is not yet visible to other users 
(except speculative get_page_unless_zero- ers, who may not update page flags before their further checks). That was part of a series in which Mel was focused on tmpfs profiles: but almost all SetPageSwapBacked uses can be so optimized, with the same justification. Remove ClearPageSwapBacked from __read_swap_cache_async() error path: it's not an error to free a page with PG_swapbacked set. Follow a convention of __SetPageLocked, __SetPageSwapBacked instead of doing it differently in different places; but that's for tidiness - if the ordering actually mattered, we should not be using the __variants. There's probably scope for further __SetPageFlags in other places, but SwapBacked is the one I'm interested in at the moment. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Reviewed-by: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index f9dfb18..53ab639 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); return MIGRATEPAGE_SUCCESS; } @@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { @@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 4cebe8a..8a83993 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page, int nr = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - SetPageSwapBacked(page); + __SetPageSwapBacked(page); if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ diff --git a/mm/shmem.c b/mm/shmem.c index e684a91..9e609d5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1085,8 +1085,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, flush_dcache_page(newpage); __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); SetPageUptodate(newpage); - SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1276,8 +1276,8 @@ repeat: goto decused; } - __SetPageSwapBacked(page); __SetPageLocked(page); + __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 366ce35..0d457e7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); @@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return new_page; } radix_tree_preload_end(); - ClearPageSwapBacked(new_page); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely -- cgit v0.10.2 From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:44 -0700 Subject: tmpfs: preliminary minor tidyups Make a few cleanups in mm/shmem.c, before going on to complicate it. shmem_alloc_page() will become more complicated: we can't afford to have that complication duplicated between a CONFIG_NUMA version and a !CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a single shmem_swapin() and a single shmem_alloc_page(). Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA configurations, but eliminating it is a larger cleanup: I have an alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and bug-prone, and changed yet again since my last version. Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to shmem_alloc_page(): that SwapBacked flag will be useful in future, to help to distinguish different cases appropriately. And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of little use (IIRC it dates back to when shmem_getpage() returned the page unlocked): kill it and do the necessary in shmem_file_read_iter(). But an arm64 build then complained that info may be uninitialized (where shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and advancing to an "sgp <= SGP_CACHE" test jogged it back to reality. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6978a99..4429d25 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + #define vma_policy(vma) NULL static inline int diff --git a/mm/shmem.c b/mm/shmem.c index 9e609d5..6d2de2c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
*/ @@ -947,8 +946,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1155,7 +1140,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1275,9 +1260,6 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageLocked(page); - __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); @@ -1321,12 +1303,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. 
*/ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) -- cgit v0.10.2 From 9e18eb29356b7dfd55183bd42cf73919d1590835 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Thu, 19 May 2016 17:12:47 -0700 Subject: tmpfs: mem_cgroup charge fault to vm_mm not current mm Although shmem_fault() has been careful to count a major fault to vm_mm, shmem_getpage_gfp() has been careless in charging a remote access fault to current->mm owner's memcg instead of to vma->vm_mm owner's memcg: that is inconsistent with all the mem_cgroup charging on remote access faults in mm/memory.c. Fix it by passing fault_mm along with fault_type to shmem_getpage_gfp(); but in that case, now knowing the right mm, it's better for it to handle the PGMAJFAULT updates itself. And let's keep this clutter out of most callers' way: change the common shmem_getpage() wrapper to hide fault_mm and fault_type as well as gfp. Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 6d2de2c..e418a99 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -121,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); static inline int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, int *fault_type) + struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -527,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (partial_start) { struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + shmem_getpage(inode, start - 1, &page, SGP_READ); if (page) { unsigned int top = PAGE_SIZE; if (start > end) { @@ -542,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } if (partial_end) { struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ, NULL); + shmem_getpage(inode, end, &page, SGP_READ); if (page) { zero_user_segment(page, 0, partial_end); set_page_dirty(page); @@ -1115,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap - * entry since a page cannot live in both the swap and page cache + * entry since a page cannot live in both the swap and page cache. + * + * fault_mm and fault_type are only supplied by shmem_fault: + * otherwise they are NULL. 
*/ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) + struct page **pagep, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; @@ -1168,14 +1174,19 @@ repeat: */ info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = fault_mm ? : current->mm; if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - /* here we actually do the io */ - if (fault_type) + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + } + /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); if (!page) { error = -ENOMEM; @@ -1202,7 +1213,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1263,7 +1274,7 @@ repeat: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (error) goto decused; @@ -1352,6 +1363,7 @@ unlock: static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); int error; int ret = VM_FAULT_LOCKED; @@ -1413,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); - - if (ret & VM_FAULT_MAJOR) { - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - } return ret; } @@ -1567,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + return shmem_getpage(inode, index, pagep, SGP_WRITE); } static int @@ -1633,7 +1641,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp, NULL); + error = shmem_getpage(inode, index, &page, sgp); if (error) { if (error == -EINVAL) error = 0; @@ -1749,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -1771,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, - SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -2215,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC, - NULL); + error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, @@ -2534,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + error = shmem_getpage(inode, 0, &page, SGP_WRITE); if (error) { iput(inode); return error; @@ -2575,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); unlock_page(page); @@ -3479,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, int error; BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + gfp, NULL, NULL); if (error) page = ERR_PTR(error); else -- cgit v0.10.2 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. 
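A hedged usage sketch from a test program (userspace C, illustrative only; as described above, the file is root-only mode 0600, and a read returns no data, it merely forces the flush):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[1];
	int fd = open("/proc/sys/vm/stat_refresh", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* A successful read returns 0 bytes; an EINVAL failure means some
	 * (non-exempted) vmstat total was negative after the refresh. */
	if (read(fd, buf, sizeof(buf)) < 0)
		perror("stat_refresh");
	close(fd);
	return 0;
}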
Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fec..720355c 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: - panic_on_oom - percpu_pagelist_fraction - stat_interval +- stat_refresh - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -755,6 +756,19 @@ is 1 second. ============================================================== +stat_refresh + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c..02fce41 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -193,6 +193,10 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); +struct ctl_table; +int vmstat_refresh(struct ctl_table *, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b3186..2effd84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9ad..c831be3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. 
/proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { -- cgit v0.10.2 From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:54 -0700 Subject: huge mm: move_huge_pmd does not need new_vma Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as it was in the old vma, alignment and size permitting. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d7b9e53..419fb9e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern bool move_huge_pmd(struct vm_area_struct *vma, - struct vm_area_struct *new_vma, - unsigned long old_addr, +extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8ac8f5..66675ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467..7d98fe1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, /* See comment in move_ptes() */ if 
(need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); if (moved) { -- cgit v0.10.2 From 1d069b7dd56728a0eb6acb138dce0d37600dee00 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:57 -0700 Subject: huge pagecache: extend mremap pmd rmap lockout to files Whatever huge pagecache implementation we go with, file rmap locking must be added to anon rmap locking, when mremap's move_page_tables() finds a pmd_trans_huge pmd entry: a simple change, let's do it now. Factor out take_rmap_locks() and drop_rmap_locks() to handle the locking for move_ptes() and move_page_tables(), and delete the VM_BUG_ON_VMA which rejected vm_file and required anon_vma. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mremap.c b/mm/mremap.c index 7d98fe1..9dc4999 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } +static void take_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); +} + +static void drop_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); +} + static pte_t move_soft_dirty_pte(pte_t pte) { /* @@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) { - struct address_space *mapping = NULL; - struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). 
*/ - if (need_rmap_locks) { - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - } - if (vma->anon_vma) { - anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); - } - } + if (need_rmap_locks) + take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst @@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); - if (anon_vma) - anon_vma_unlock_write(anon_vma); - if (mapping) - i_mmap_unlock_write(mapping); + if (need_rmap_locks) + drop_rmap_locks(vma); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -193,15 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; - VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, - vma); /* See comment in move_ptes() */ if (need_rmap_locks) - anon_vma_lock_write(vma->anon_vma); + take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, old_end, old_pmd, new_pmd); if (need_rmap_locks) - anon_vma_unlock_write(vma->anon_vma); + drop_rmap_locks(vma); if (moved) { need_flush = true; continue; -- cgit v0.10.2 From fd8cfd3000191cb7f5b9ea8640bd46181f6b4b74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:13:00 -0700 Subject: arch: fix has_transparent_hugepage() I've just discovered that the useful-sounding has_transparent_hugepage() is actually an architecture-dependent minefield: on some arches it only builds if CONFIG_TRANSPARENT_HUGEPAGE=y, on others it's also there when not, but on some of those (arm and arm64) it then gives the wrong answer; and on mips alone it's marked __init, which would crash if called later (but so far it has not been called later). Straighten this out: make it available to all configs, with a sensible default in asm-generic/pgtable.h, removing its definitions from those arches (arc, arm, arm64, sparc, tile) which are served by the default, adding #define has_transparent_hugepage has_transparent_hugepage to those (mips, powerpc, s390, x86) which need to override the default at runtime, and removing the __init from mips (but maybe that kind of code should be avoided after init: set a static variable the first time it's called). Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Acked-by: David S. 
Miller Acked-by: Vineet Gupta [arch/arc] Acked-by: Gerald Schaefer [arch/s390] Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 7afe335..317ff77 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); -#define has_transparent_hugepage() 1 - /* Generic variants assume pgtable_t is struct page *, hence need for these */ #define __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index dc46398..fa70db7 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, flush_pmd_entry(pmdp); } -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #endif /* __ASSEMBLY__ */ #endif /* _ASM_PGTABLE_3LEVEL_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2da46ae..a7ac45a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -314,11 +314,6 @@ static inline int pmd_protnone(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9a4fe01..f53a7e3a4 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -468,6 +468,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); static inline int pmd_trans_huge(pmd_t pmd) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index c17d762..2d93b63 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -400,19 +400,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int __init has_transparent_hugepage(void) +int has_transparent_hugepage(void) { - unsigned int mask; - unsigned long flags; - - local_irq_save(flags); - write_c0_pagemask(PM_HUGE_MASK); - back_to_back_c0_hazard(); - mask = read_c0_pagemask(); - write_c0_pagemask(PM_DEFAULT_MASK); + static unsigned int mask = -1; - local_irq_restore(flags); + if (mask == -1) { /* first call comes during __init */ + unsigned long flags; + local_irq_save(flags); + write_c0_pagemask(PM_HUGE_MASK); + back_to_back_c0_hazard(); + mask = read_c0_pagemask(); + write_c0_pagemask(PM_DEFAULT_MASK); + local_irq_restore(flags); + } return mask == PM_HUGE_MASK; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 77d3ce0..8fe6f6b4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -219,6 +219,7 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); +#define 
has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 47897a3..ee09e99 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 -#define has_transparent_hugepage() 0 #endif pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *shift); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2f66645..18d2beb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return MACHINE_HAS_HPAGE ? 1 : 0; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f089cfa..93ce0ad 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd) return pte_val(pte) & _PAGE_PMD_HUGE; } -#define has_transparent_hugepage() 1 - static inline pmd_t pmd_mkold(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 96cecf5..2a26cc4 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 #define pmd_trans_huge pmd_huge_page #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f86491a..1a27396 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9401f48..d4458b6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd) #define io_remap_pfn_range remap_pfn_range #endif +#ifndef has_transparent_hugepage +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage() 1 +#else +#define has_transparent_hugepage() 0 +#endif +#endif + #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit v0.10.2 From 8604d9e534a3e662600e288bcfd1a5acd2763d28 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 19 May 2016 17:13:03 -0700 Subject: memory_hotplug: introduce CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE This patchset continues the work I started with commit 31bc3858ea3e ("memory-hotplug: add automatic onlining policy for the newly added memory"). Initially I was going to stop there and bring the policy setting logic to userspace. I met two issues on this way: 1) It is possible to have memory hotplugged at boot (e.g. with QEMU). These blocks stay offlined if we turn the onlining policy on by userspace. 
2) My attempt to bring this policy setting to systemd failed; systemd maintainers suggested changing the default in the kernel or ... using tmpfiles.d to alter the policy (which looks like a hack to me): https://github.com/systemd/systemd/pull/2938 Here I suggest adding a config option to set the default value for the policy, and a kernel command line parameter to override it. This patch (of 2): Introduce a config option to set the default value for the memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks). The reasons one would want to turn this option on are to have early onlining for hotpluggable memory available at boot, and to not require any userspace action to make memory hotplug work. [akpm@linux-foundation.org: tweak Kconfig text] Signed-off-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Dan Williams Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: David Vrabel Cc: David Rientjes Cc: Igor Mammedov Cc: Lennart Poettering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 443f4b4..0d7cb95 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file: % cat /sys/devices/system/memory/auto_online_blocks -The default is "offline" which means the newly added memory is not in a -ready-to-use state and you have to "online" the newly added memory blocks -manually. Automatic onlining can be requested by writing "online" to -"auto_online_blocks" file: +The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config +option. If it is disabled the default is "offline" which means the newly added +memory is not in a ready-to-use state and you have to "online" the newly added +memory blocks manually. Automatic onlining can be requested by writing "online" +to "auto_online_blocks" file: % echo online > /sys/devices/system/memory/auto_online_blocks diff --git a/mm/Kconfig b/mm/Kconfig index d6e9042..b0432b7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE def_bool y depends on SPARSEMEM && MEMORY_HOTPLUG +config MEMORY_HOTPLUG_DEFAULT_ONLINE + bool "Online the newly added memory blocks by default" + default n + depends on MEMORY_HOTPLUG + help + This option sets the default policy setting for memory hotplug + onlining policy (/sys/devices/system/memory/auto_online_blocks) which + determines what happens to newly added memory regions. Policy setting + can always be changed at runtime. + See Documentation/memory-hotplug.txt for more information. + + Say Y here if you want all hot-plugged memory blocks to appear in + 'online' state by default. + Say N here if you want the default policy to keep all hot-plugged + memory blocks in 'offline' state. 
+ config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select MEMORY_ISOLATION diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b21d889..fcafbfc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -78,7 +78,11 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; +#else +bool memhp_auto_online = true; +#endif EXPORT_SYMBOL_GPL(memhp_auto_online); void get_online_mems(void) -- cgit v0.10.2 From 86dd995d63241039e0ad9123f9b424013c611510 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 19 May 2016 17:13:06 -0700 Subject: memory_hotplug: introduce memhp_default_state= command line parameter CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE specifies the default value for the memory hotplug onlining policy. Add a command line parameter to make it possible to override the default. It may come in handy for debug and testing purposes. Signed-off-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Dan Williams Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: David Vrabel Cc: David Rientjes Cc: Igor Mammedov Cc: Lennart Poettering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7944031..2edb27b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2161,6 +2161,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL,SH] Allow user to override the default size for per-device physically contiguous DMA buffers. + memhp_default_state=online/offline + [KNL] Set the initial state for the memory hotplug + onlining policy. If not specified, the default value is + set according to the + CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config + option. + See Documentation/memory-hotplug.txt. + memmap=exactmap [KNL,X86] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fcafbfc..caf2a14 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -85,6 +85,17 @@ bool memhp_auto_online = true; #endif EXPORT_SYMBOL_GPL(memhp_auto_online); +static int __init setup_memhp_default_state(char *str) +{ + if (!strcmp(str, "online")) + memhp_auto_online = true; + else if (!strcmp(str, "offline")) + memhp_auto_online = false; + + return 1; +} +__setup("memhp_default_state=", setup_memhp_default_state); + void get_online_mems(void) { might_sleep(); -- cgit v0.10.2 From 3da88fb3bacfaa33ff9d13730d17110bb2d9604d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:09 -0700 Subject: mm, oom: move GFP_NOFS check to out_of_memory __alloc_pages_may_oom is the central place to decide when out_of_memory should be invoked. This is a good approach for most checks there because they are page allocator specific and the allocation fails right after for all of them. The notable exception is the GFP_NOFS context, which fakes did_some_progress and keeps the page allocator looping even though there couldn't have been any progress from the OOM killer. This patch doesn't change this behavior because we are not ready to allow those allocation requests to fail yet (and maybe we will face the reality that we will never manage to safely fail these requests). 
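To make the gate concrete before the diffs, here is a minimal sketch of the check being moved (editor's illustration; oom_may_select_victim() is a hypothetical name, the real code is the out_of_memory() hunk below):

/*
 * A zero gfp_mask (pagefault_out_of_memory lost its gfp context) must
 * still be allowed to select a victim; any other caller without
 * __GFP_FS or __GFP_NOFAIL bails out, pretending progress was made.
 */
static bool oom_may_select_victim(struct oom_control *oc)
{
	if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS | __GFP_NOFAIL)))
		return false;
	return true;
}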
Instead, the __GFP_FS check is moved down to out_of_memory and prevents OOM victim selection there. There are two reasons for that: OOM notifiers might release some memory even from this context as none of the registered notifiers seems to be FS related, and this might help a dying thread to get access to memory reserves and move on, which will make the behavior more consistent with the case when the task gets killed from a different context. Keep a comment in __alloc_pages_may_oom to make sure we do not forget how GFP_NOFS is special and that we really want to do something about it. Note to the current oom_notifier users: The observable difference for you is that oom notifiers cannot depend on any fs locks because we could deadlock. Not that this would be allowed today because that would just lock up the machine in most of the cases, ruling out the OOM killer along the way. Another difference is that callbacks might be invoked sooner now because GFP_NOFS is a weaker reclaim context and so there could be reclaimable memory which is just not reachable now. That would require GFP_NOFS-only loads, which are really rare, and more importantly the observable result would be dropping of reconstructible objects and a potential performance drop, which is not such a big deal when we are struggling to fulfill other important allocation requests. Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8634958..32d8210 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -877,6 +877,15 @@ bool out_of_memory(struct oom_control *oc) } /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure to exclude the 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. + */ + if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + return true; + + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da6d339..6d1c8b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2875,22 +2875,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for IO-less reclaim */ - if (!(gfp_mask & __GFP_FS)) { - /* - * XXX: Page reclaim didn't yield anything, - * and the OOM killer can't be invoked, but - * keep looping as per tradition. - * - * But do not keep looping if oom_killer_disable() - * was already called, for the system is trying to - * enter a quiescent state during suspend. - */ - *did_some_progress = !oom_killer_disabled; - goto out; - } if (pm_suspended_storage()) goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other requests to make forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserves if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here.
+ */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; -- cgit v0.10.2 From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:12 -0700 Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path If the current task is already killed or PF_EXITING, or a selected task is PF_EXITING, then the oom killer is suppressed and so is the oom reaper. This patch adds try_oom_reaper which checks the given task and queues it for the oom reaper if that is safe to do, meaning that the task doesn't share the mm with a live process. This might help to release the memory pressure while the task tries to exit. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/oom.h b/include/linux/oom.h index 628a432..83b9c39 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); +#ifdef CONFIG_MMU +extern void try_oom_reaper(struct task_struct *tsk); +#else +static inline void try_oom_reaper(struct task_struct *tsk) +{ +} +#endif + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b40dca..d71d387 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 32d8210..850b6ff 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot access mm anymore.
+ */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -653,24 +719,6 @@ void oom_killer_enable(void) } /* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - -/* * Must be called while holding a reference to p, which will be released upon * returning. */ @@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } -- cgit v0.10.2 From 449d777d7ad6d7f9ac5ed8f618fa13e6ff36c32f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:15 -0700 Subject: mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper Right now the oom reaper will clear TIF_MEMDIE only for tasks which were successfully reaped. This is the safest option because we know that such an oom victim would only block forward progress of the oom killer without a good reason because it is highly unlikely it would release much more memory. Basically most of its memory has already been torn down. We can relax this assumption to catch more corner cases though. The first obvious one is when the oom victim clears its mm and gets stuck later on. oom_reaper would back off on find_lock_task_mm returning NULL. We can safely try to clear TIF_MEMDIE in this case because such a task would be ignored by the oom killer anyway. The flag would be cleared by that time already most of the time anyway. The less obvious one is when the oom reaper fails due to mmap_sem contention. Even if we clear TIF_MEMDIE for this task then it is not very likely that we would select another task too easily because we haven't reaped the last victim and so it would be still the #1 candidate. There is a rare race condition possible when the current victim terminates before the next select_bad_process but considering that oom_reap_task had retried several times before giving up then this sounds like a borderline thing. After this patch we should have a guarantee that the OOM killer will not be blocked for an unbounded amount of time for most cases. Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E.
McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 850b6ff..415f7eb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -510,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore. OOM killer can continue - * by selecting other victim if unmapping hasn't led to any - * improvements. This also means that selecting this task doesn't - * make any sense. + * This task can be safely ignored because we cannot do much more + * to release its memory. */ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; - exit_oom_victim(tsk); out: mmput(mm); return ret; @@ -538,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); } + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore or it is not a good candidate + * for the oom victim right now because it cannot release its memory + * itself nor by the oom reaper. + */ + tsk->oom_reaper_list = NULL; + exit_oom_victim(tsk); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } -- cgit v0.10.2 From d61f8590397480981f0d3ee7a2b38b5ea990db52 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:18 -0700 Subject: mm, page_alloc: only check PageCompound for high-order pages Another year, another round of page allocator optimisations focusing this time on the alloc and free fast paths. This should be of help to workloads that are allocator-intensive from kernel space where the cost of zeroing is not necessarily incurred. The series is motivated by the observation that page alloc microbenchmarks on multiple machines regressed between 3.12.44 and 4.4. Second, there were discussions before LSF/MM considering the possibility of adding another page allocator which is potentially hazardous but a patch series improving performance is better than whining. After the series is applied, there are still hazards. In the free paths, the debugging checks and page zone/pageblock lookups dominate but there was not an obvious solution to that. In the alloc path, the major contributors are dealing with zonelists, new page preparation, the fair zone allocation and numerous statistic updates. The fair zone allocator is removed by the per-node LRU series if that gets merged so it's not a major concern at the moment.
On normal userspace benchmarks, there is little impact as the zeroing cost is significant but it's visible:

aim9
                        4.6.0-rc3             4.6.0-rc3
                          vanilla         deferalloc-v3
Min page_test 828693.33 ( 0.00%) 887060.00 ( 7.04%)
Min brk_test 4847266.67 ( 0.00%) 4966266.67 ( 2.45%)
Min exec_test 1271.00 ( 0.00%) 1275.67 ( 0.37%)
Min fork_test 12371.75 ( 0.00%) 12380.00 ( 0.07%)

The overall impact on a page allocator microbenchmark for a range of orders and number of pages allocated in a batch is:

                        4.6.0-rc3             4.6.0-rc3
                          vanilla       deferalloc-v3r7
Min alloc-odr0-1 428.00 ( 0.00%) 316.00 ( 26.17%)
Min alloc-odr0-2 314.00 ( 0.00%) 231.00 ( 26.43%)
Min alloc-odr0-4 256.00 ( 0.00%) 192.00 ( 25.00%)
Min alloc-odr0-8 222.00 ( 0.00%) 166.00 ( 25.23%)
Min alloc-odr0-16 207.00 ( 0.00%) 154.00 ( 25.60%)
Min alloc-odr0-32 197.00 ( 0.00%) 148.00 ( 24.87%)
Min alloc-odr0-64 193.00 ( 0.00%) 144.00 ( 25.39%)
Min alloc-odr0-128 191.00 ( 0.00%) 143.00 ( 25.13%)
Min alloc-odr0-256 203.00 ( 0.00%) 153.00 ( 24.63%)
Min alloc-odr0-512 212.00 ( 0.00%) 165.00 ( 22.17%)
Min alloc-odr0-1024 221.00 ( 0.00%) 172.00 ( 22.17%)
Min alloc-odr0-2048 225.00 ( 0.00%) 179.00 ( 20.44%)
Min alloc-odr0-4096 232.00 ( 0.00%) 185.00 ( 20.26%)
Min alloc-odr0-8192 235.00 ( 0.00%) 187.00 ( 20.43%)
Min alloc-odr0-16384 236.00 ( 0.00%) 188.00 ( 20.34%)
Min alloc-odr1-1 519.00 ( 0.00%) 450.00 ( 13.29%)
Min alloc-odr1-2 391.00 ( 0.00%) 336.00 ( 14.07%)
Min alloc-odr1-4 313.00 ( 0.00%) 268.00 ( 14.38%)
Min alloc-odr1-8 277.00 ( 0.00%) 235.00 ( 15.16%)
Min alloc-odr1-16 256.00 ( 0.00%) 218.00 ( 14.84%)
Min alloc-odr1-32 252.00 ( 0.00%) 212.00 ( 15.87%)
Min alloc-odr1-64 244.00 ( 0.00%) 206.00 ( 15.57%)
Min alloc-odr1-128 244.00 ( 0.00%) 207.00 ( 15.16%)
Min alloc-odr1-256 243.00 ( 0.00%) 207.00 ( 14.81%)
Min alloc-odr1-512 245.00 ( 0.00%) 209.00 ( 14.69%)
Min alloc-odr1-1024 248.00 ( 0.00%) 214.00 ( 13.71%)
Min alloc-odr1-2048 253.00 ( 0.00%) 220.00 ( 13.04%)
Min alloc-odr1-4096 258.00 ( 0.00%) 224.00 ( 13.18%)
Min alloc-odr1-8192 261.00 ( 0.00%) 229.00 ( 12.26%)
Min alloc-odr2-1 560.00 ( 0.00%) 753.00 (-34.46%)
Min alloc-odr2-2 424.00 ( 0.00%) 351.00 ( 17.22%)
Min alloc-odr2-4 339.00 ( 0.00%) 393.00 (-15.93%)
Min alloc-odr2-8 298.00 ( 0.00%) 246.00 ( 17.45%)
Min alloc-odr2-16 276.00 ( 0.00%) 227.00 ( 17.75%)
Min alloc-odr2-32 271.00 ( 0.00%) 221.00 ( 18.45%)
Min alloc-odr2-64 264.00 ( 0.00%) 217.00 ( 17.80%)
Min alloc-odr2-128 264.00 ( 0.00%) 217.00 ( 17.80%)
Min alloc-odr2-256 264.00 ( 0.00%) 218.00 ( 17.42%)
Min alloc-odr2-512 269.00 ( 0.00%) 223.00 ( 17.10%)
Min alloc-odr2-1024 279.00 ( 0.00%) 230.00 ( 17.56%)
Min alloc-odr2-2048 283.00 ( 0.00%) 235.00 ( 16.96%)
Min alloc-odr2-4096 285.00 ( 0.00%) 239.00 ( 16.14%)
Min alloc-odr3-1 629.00 ( 0.00%) 505.00 ( 19.71%)
Min alloc-odr3-2 472.00 ( 0.00%) 374.00 ( 20.76%)
Min alloc-odr3-4 383.00 ( 0.00%) 301.00 ( 21.41%)
Min alloc-odr3-8 341.00 ( 0.00%) 266.00 ( 21.99%)
Min alloc-odr3-16 316.00 ( 0.00%) 248.00 ( 21.52%)
Min alloc-odr3-32 308.00 ( 0.00%) 241.00 ( 21.75%)
Min alloc-odr3-64 305.00 ( 0.00%) 241.00 ( 20.98%)
Min alloc-odr3-128 308.00 ( 0.00%) 244.00 ( 20.78%)
Min alloc-odr3-256 317.00 ( 0.00%) 249.00 ( 21.45%)
Min alloc-odr3-512 327.00 ( 0.00%) 256.00 ( 21.71%)
Min alloc-odr3-1024 331.00 ( 0.00%) 261.00 ( 21.15%)
Min alloc-odr3-2048 333.00 ( 0.00%) 266.00 ( 20.12%)
Min alloc-odr4-1 767.00 ( 0.00%) 572.00 ( 25.42%)
Min alloc-odr4-2 578.00 ( 0.00%) 429.00 ( 25.78%)
Min alloc-odr4-4 474.00 ( 0.00%) 346.00 ( 27.00%)
Min alloc-odr4-8 422.00 ( 0.00%) 310.00 ( 26.54%)
Min alloc-odr4-16 399.00 ( 0.00%) 295.00 ( 26.07%)
Min alloc-odr4-32 392.00 ( 0.00%) 293.00 ( 25.26%)
Min alloc-odr4-64 394.00 ( 0.00%) 293.00 ( 25.63%)
Min alloc-odr4-128 405.00 ( 0.00%) 305.00 ( 24.69%)
Min alloc-odr4-256 417.00 ( 0.00%) 319.00 ( 23.50%)
Min alloc-odr4-512 425.00 ( 0.00%) 326.00 ( 23.29%)
Min alloc-odr4-1024 426.00 ( 0.00%) 329.00 ( 22.77%)
Min free-odr0-1 216.00 ( 0.00%) 178.00 ( 17.59%)
Min free-odr0-2 152.00 ( 0.00%) 125.00 ( 17.76%)
Min free-odr0-4 120.00 ( 0.00%) 99.00 ( 17.50%)
Min free-odr0-8 106.00 ( 0.00%) 85.00 ( 19.81%)
Min free-odr0-16 97.00 ( 0.00%) 80.00 ( 17.53%)
Min free-odr0-32 92.00 ( 0.00%) 76.00 ( 17.39%)
Min free-odr0-64 89.00 ( 0.00%) 74.00 ( 16.85%)
Min free-odr0-128 89.00 ( 0.00%) 73.00 ( 17.98%)
Min free-odr0-256 107.00 ( 0.00%) 90.00 ( 15.89%)
Min free-odr0-512 117.00 ( 0.00%) 108.00 ( 7.69%)
Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%)
Min free-odr0-2048 132.00 ( 0.00%) 125.00 ( 5.30%)
Min free-odr0-4096 135.00 ( 0.00%) 130.00 ( 3.70%)
Min free-odr0-8192 137.00 ( 0.00%) 130.00 ( 5.11%)
Min free-odr0-16384 137.00 ( 0.00%) 131.00 ( 4.38%)
Min free-odr1-1 318.00 ( 0.00%) 289.00 ( 9.12%)
Min free-odr1-2 228.00 ( 0.00%) 207.00 ( 9.21%)
Min free-odr1-4 182.00 ( 0.00%) 165.00 ( 9.34%)
Min free-odr1-8 163.00 ( 0.00%) 146.00 ( 10.43%)
Min free-odr1-16 151.00 ( 0.00%) 135.00 ( 10.60%)
Min free-odr1-32 146.00 ( 0.00%) 129.00 ( 11.64%)
Min free-odr1-64 145.00 ( 0.00%) 130.00 ( 10.34%)
Min free-odr1-128 148.00 ( 0.00%) 134.00 ( 9.46%)
Min free-odr1-256 148.00 ( 0.00%) 137.00 ( 7.43%)
Min free-odr1-512 151.00 ( 0.00%) 140.00 ( 7.28%)
Min free-odr1-1024 154.00 ( 0.00%) 143.00 ( 7.14%)
Min free-odr1-2048 156.00 ( 0.00%) 144.00 ( 7.69%)
Min free-odr1-4096 156.00 ( 0.00%) 142.00 ( 8.97%)
Min free-odr1-8192 156.00 ( 0.00%) 140.00 ( 10.26%)
Min free-odr2-1 361.00 ( 0.00%) 457.00 (-26.59%)
Min free-odr2-2 258.00 ( 0.00%) 224.00 ( 13.18%)
Min free-odr2-4 208.00 ( 0.00%) 223.00 ( -7.21%)
Min free-odr2-8 185.00 ( 0.00%) 160.00 ( 13.51%)
Min free-odr2-16 173.00 ( 0.00%) 149.00 ( 13.87%)
Min free-odr2-32 166.00 ( 0.00%) 145.00 ( 12.65%)
Min free-odr2-64 166.00 ( 0.00%) 146.00 ( 12.05%)
Min free-odr2-128 169.00 ( 0.00%) 148.00 ( 12.43%)
Min free-odr2-256 170.00 ( 0.00%) 152.00 ( 10.59%)
Min free-odr2-512 177.00 ( 0.00%) 156.00 ( 11.86%)
Min free-odr2-1024 182.00 ( 0.00%) 162.00 ( 10.99%)
Min free-odr2-2048 181.00 ( 0.00%) 160.00 ( 11.60%)
Min free-odr2-4096 180.00 ( 0.00%) 159.00 ( 11.67%)
Min free-odr3-1 431.00 ( 0.00%) 367.00 ( 14.85%)
Min free-odr3-2 306.00 ( 0.00%) 259.00 ( 15.36%)
Min free-odr3-4 249.00 ( 0.00%) 208.00 ( 16.47%)
Min free-odr3-8 224.00 ( 0.00%) 186.00 ( 16.96%)
Min free-odr3-16 208.00 ( 0.00%) 176.00 ( 15.38%)
Min free-odr3-32 206.00 ( 0.00%) 174.00 ( 15.53%)
Min free-odr3-64 210.00 ( 0.00%) 178.00 ( 15.24%)
Min free-odr3-128 215.00 ( 0.00%) 182.00 ( 15.35%)
Min free-odr3-256 224.00 ( 0.00%) 189.00 ( 15.62%)
Min free-odr3-512 232.00 ( 0.00%) 195.00 ( 15.95%)
Min free-odr3-1024 230.00 ( 0.00%) 195.00 ( 15.22%)
Min free-odr3-2048 229.00 ( 0.00%) 193.00 ( 15.72%)
Min free-odr4-1 561.00 ( 0.00%) 439.00 ( 21.75%)
Min free-odr4-2 418.00 ( 0.00%) 318.00 ( 23.92%)
Min free-odr4-4 339.00 ( 0.00%) 269.00 ( 20.65%)
Min free-odr4-8 299.00 ( 0.00%) 239.00 ( 20.07%)
Min free-odr4-16 289.00 ( 0.00%) 234.00 ( 19.03%)
Min free-odr4-32 291.00 ( 0.00%) 235.00 ( 19.24%)
Min free-odr4-64 298.00 ( 0.00%) 238.00 ( 20.13%)
Min free-odr4-128 308.00 ( 0.00%) 251.00 ( 18.51%)
Min free-odr4-256 321.00 ( 0.00%) 267.00 ( 16.82%)
Min free-odr4-512 327.00 ( 0.00%) 269.00 ( 17.74%)
Min free-odr4-1024 326.00 ( 0.00%) 271.00 ( 16.87%)
Min total-odr0-1 644.00 ( 0.00%) 494.00 ( 23.29%)
Min total-odr0-2 466.00 ( 0.00%) 356.00 ( 23.61%)
Min total-odr0-4 376.00 ( 0.00%) 291.00 ( 22.61%)
Min total-odr0-8 328.00 ( 0.00%) 251.00 ( 23.48%)
Min total-odr0-16 304.00 ( 0.00%) 234.00 ( 23.03%)
Min total-odr0-32 289.00 ( 0.00%) 224.00 ( 22.49%)
Min total-odr0-64 282.00 ( 0.00%) 218.00 ( 22.70%)
Min total-odr0-128 280.00 ( 0.00%) 216.00 ( 22.86%)
Min total-odr0-256 310.00 ( 0.00%) 243.00 ( 21.61%)
Min total-odr0-512 329.00 ( 0.00%) 273.00 ( 17.02%)
Min total-odr0-1024 346.00 ( 0.00%) 290.00 ( 16.18%)
Min total-odr0-2048 357.00 ( 0.00%) 304.00 ( 14.85%)
Min total-odr0-4096 367.00 ( 0.00%) 315.00 ( 14.17%)
Min total-odr0-8192 372.00 ( 0.00%) 317.00 ( 14.78%)
Min total-odr0-16384 373.00 ( 0.00%) 319.00 ( 14.48%)
Min total-odr1-1 838.00 ( 0.00%) 739.00 ( 11.81%)
Min total-odr1-2 619.00 ( 0.00%) 543.00 ( 12.28%)
Min total-odr1-4 495.00 ( 0.00%) 433.00 ( 12.53%)
Min total-odr1-8 440.00 ( 0.00%) 382.00 ( 13.18%)
Min total-odr1-16 407.00 ( 0.00%) 353.00 ( 13.27%)
Min total-odr1-32 398.00 ( 0.00%) 341.00 ( 14.32%)
Min total-odr1-64 389.00 ( 0.00%) 336.00 ( 13.62%)
Min total-odr1-128 392.00 ( 0.00%) 341.00 ( 13.01%)
Min total-odr1-256 391.00 ( 0.00%) 344.00 ( 12.02%)
Min total-odr1-512 396.00 ( 0.00%) 349.00 ( 11.87%)
Min total-odr1-1024 402.00 ( 0.00%) 357.00 ( 11.19%)
Min total-odr1-2048 409.00 ( 0.00%) 364.00 ( 11.00%)
Min total-odr1-4096 414.00 ( 0.00%) 366.00 ( 11.59%)
Min total-odr1-8192 417.00 ( 0.00%) 369.00 ( 11.51%)
Min total-odr2-1 921.00 ( 0.00%) 1210.00 (-31.38%)
Min total-odr2-2 682.00 ( 0.00%) 576.00 ( 15.54%)
Min total-odr2-4 547.00 ( 0.00%) 616.00 (-12.61%)
Min total-odr2-8 483.00 ( 0.00%) 406.00 ( 15.94%)
Min total-odr2-16 449.00 ( 0.00%) 376.00 ( 16.26%)
Min total-odr2-32 437.00 ( 0.00%) 366.00 ( 16.25%)
Min total-odr2-64 431.00 ( 0.00%) 363.00 ( 15.78%)
Min total-odr2-128 433.00 ( 0.00%) 365.00 ( 15.70%)
Min total-odr2-256 434.00 ( 0.00%) 371.00 ( 14.52%)
Min total-odr2-512 446.00 ( 0.00%) 379.00 ( 15.02%)
Min total-odr2-1024 461.00 ( 0.00%) 392.00 ( 14.97%)
Min total-odr2-2048 464.00 ( 0.00%) 395.00 ( 14.87%)
Min total-odr2-4096 465.00 ( 0.00%) 398.00 ( 14.41%)
Min total-odr3-1 1060.00 ( 0.00%) 872.00 ( 17.74%)
Min total-odr3-2 778.00 ( 0.00%) 633.00 ( 18.64%)
Min total-odr3-4 632.00 ( 0.00%) 510.00 ( 19.30%)
Min total-odr3-8 565.00 ( 0.00%) 452.00 ( 20.00%)
Min total-odr3-16 524.00 ( 0.00%) 424.00 ( 19.08%)
Min total-odr3-32 514.00 ( 0.00%) 415.00 ( 19.26%)
Min total-odr3-64 515.00 ( 0.00%) 419.00 ( 18.64%)
Min total-odr3-128 523.00 ( 0.00%) 426.00 ( 18.55%)
Min total-odr3-256 541.00 ( 0.00%) 438.00 ( 19.04%)
Min total-odr3-512 559.00 ( 0.00%) 451.00 ( 19.32%)
Min total-odr3-1024 561.00 ( 0.00%) 456.00 ( 18.72%)
Min total-odr3-2048 562.00 ( 0.00%) 459.00 ( 18.33%)
Min total-odr4-1 1328.00 ( 0.00%) 1011.00 ( 23.87%)
Min total-odr4-2 997.00 ( 0.00%) 747.00 ( 25.08%)
Min total-odr4-4 813.00 ( 0.00%) 615.00 ( 24.35%)
Min total-odr4-8 721.00 ( 0.00%) 550.00 ( 23.72%)
Min total-odr4-16 689.00 ( 0.00%) 529.00 ( 23.22%)
Min total-odr4-32 683.00 ( 0.00%) 528.00 ( 22.69%)
Min total-odr4-64 692.00 ( 0.00%) 531.00 ( 23.27%)
Min total-odr4-128 713.00 ( 0.00%) 556.00 ( 22.02%)
Min total-odr4-256 738.00 ( 0.00%) 586.00 ( 20.60%)
Min total-odr4-512 753.00 ( 0.00%) 595.00 ( 20.98%)
Min total-odr4-1024 752.00 ( 0.00%) 600.00 ( 20.21%)

This patch (of 27): order-0 pages by definition cannot be compound so avoid the check in the fast path for those pages.
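A minimal userspace sketch of the reordered check follows; struct page and PageCompound are stubbed out here, and only the control flow is the point:

#include <stdbool.h>
#include <stdio.h>

struct page { bool compound; };

static bool page_compound(const struct page *p)
{
    return p->compound;
}

/*
 * Only inspect compound state when order > 0: an order-0 page cannot
 * be compound, so the common free fast path skips the test entirely.
 */
static int free_prepare_check(struct page *pages, unsigned int order)
{
    int bad = 0;

    if (order) {    /* the kernel writes unlikely(order) here */
        bool compound = page_compound(&pages[0]);

        for (unsigned int i = 1; i < (1u << order); i++)
            if (compound && !page_compound(&pages[i]))
                bad++;    /* stand-in for the tail page checks */
    }
    return bad;
}

int main(void)
{
    struct page single = { false };

    printf("order-0 bad count: %d\n", free_prepare_check(&single, 0));
    return 0;
}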
[akpm@linux-foundation.org: use unlikely(order) in free_pages_prepare(), per Vlastimil] Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d1c8b0..087ba3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1024,24 +1024,33 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) static bool free_pages_prepare(struct page *page, unsigned int order) { - bool compound = PageCompound(page); - int i, bad = 0; + int bad = 0; VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); kasan_free_pages(page, order); + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. + */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + bad += free_pages_check(page + i); + } + } if (PageAnon(page)) page->mapping = NULL; bad += free_pages_check(page); - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); - } if (bad) return false; -- cgit v0.10.2 From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:21 -0700 Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast path The PageAnon check always checks for compound_head but this is a relatively expensive check if the caller already knows the page is a head page. This patch creates a helper and uses it in the page free path which only operates on head pages. 
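A userspace sketch of the head-only shortcut; the mapping encoding and field names are simplified stand-ins, not the kernel's layout:

#include <stdio.h>

#define ANON_BIT 0x1ul

struct page {
    unsigned long mapping;    /* low bit flags an anonymous page */
    struct page *head;        /* compound head, or the page itself */
};

/* Head-only variant: no head lookup; the caller guarantees a head page. */
static int page_anon_head(const struct page *page)
{
    return (page->mapping & ANON_BIT) != 0;
}

/* Generic variant: pays for the head lookup on every call. */
static int page_anon(const struct page *page)
{
    return page_anon_head(page->head);
}

int main(void)
{
    struct page head = { ANON_BIT, &head };
    struct page tail = { 0, &head };

    printf("head: %d, via tail: %d\n", page_anon_head(&head), page_anon(&tail));
    return 0;
}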
With this patch and "Only check PageCompound for high-order pages", the performance difference on a page allocator microbenchmark is:

                        4.6.0-rc2             4.6.0-rc2
                          vanilla      nocompound-v1r20
Min alloc-odr0-1 425.00 ( 0.00%) 417.00 ( 1.88%)
Min alloc-odr0-2 313.00 ( 0.00%) 308.00 ( 1.60%)
Min alloc-odr0-4 257.00 ( 0.00%) 253.00 ( 1.56%)
Min alloc-odr0-8 224.00 ( 0.00%) 221.00 ( 1.34%)
Min alloc-odr0-16 208.00 ( 0.00%) 205.00 ( 1.44%)
Min alloc-odr0-32 199.00 ( 0.00%) 199.00 ( 0.00%)
Min alloc-odr0-64 195.00 ( 0.00%) 193.00 ( 1.03%)
Min alloc-odr0-128 192.00 ( 0.00%) 191.00 ( 0.52%)
Min alloc-odr0-256 204.00 ( 0.00%) 200.00 ( 1.96%)
Min alloc-odr0-512 213.00 ( 0.00%) 212.00 ( 0.47%)
Min alloc-odr0-1024 219.00 ( 0.00%) 219.00 ( 0.00%)
Min alloc-odr0-2048 225.00 ( 0.00%) 225.00 ( 0.00%)
Min alloc-odr0-4096 230.00 ( 0.00%) 231.00 ( -0.43%)
Min alloc-odr0-8192 235.00 ( 0.00%) 234.00 ( 0.43%)
Min alloc-odr0-16384 235.00 ( 0.00%) 234.00 ( 0.43%)
Min free-odr0-1 215.00 ( 0.00%) 191.00 ( 11.16%)
Min free-odr0-2 152.00 ( 0.00%) 136.00 ( 10.53%)
Min free-odr0-4 119.00 ( 0.00%) 107.00 ( 10.08%)
Min free-odr0-8 106.00 ( 0.00%) 96.00 ( 9.43%)
Min free-odr0-16 97.00 ( 0.00%) 87.00 ( 10.31%)
Min free-odr0-32 91.00 ( 0.00%) 83.00 ( 8.79%)
Min free-odr0-64 89.00 ( 0.00%) 81.00 ( 8.99%)
Min free-odr0-128 88.00 ( 0.00%) 80.00 ( 9.09%)
Min free-odr0-256 106.00 ( 0.00%) 95.00 ( 10.38%)
Min free-odr0-512 116.00 ( 0.00%) 111.00 ( 4.31%)
Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%)
Min free-odr0-2048 133.00 ( 0.00%) 126.00 ( 5.26%)
Min free-odr0-4096 136.00 ( 0.00%) 130.00 ( 4.41%)
Min free-odr0-8192 138.00 ( 0.00%) 130.00 ( 5.80%)
Min free-odr0-16384 137.00 ( 0.00%) 130.00 ( 5.11%)

There is a sizable boost to the free allocator performance. While there is an apparent boost on the allocation side, it's likely a coincidence or due to the patches slightly reducing cache footprint. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b052aa..a61e06e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +static __always_inline int PageAnonHead(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return PageAnonHead(page); } #ifdef CONFIG_KSM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 087ba3e..7be1ce8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) bad += free_pages_check(page + i); } } - if (PageAnon(page)) + if (PageAnonHead(page)) page->mapping = NULL; bad += free_pages_check(page); if (bad) -- cgit v0.10.2 From b9f00e147f27d86691f7f52a3c8126d25432477c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:24 -0700 Subject: mm, page_alloc: reduce branches in zone_statistics zone_statistics has more branches than it really needs in order to take an unlikely GFP flag into account. Reduce the number and annotate the unlikely flag.
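The restructured accounting can be modelled in userspace as follows; the enum values and the other-node flag are illustrative stand-ins, and the NUMA_FOREIGN counter is left out of the sketch:

#include <stdio.h>

enum { NUMA_HIT, NUMA_MISS, NUMA_LOCAL, NUMA_OTHER, NR_ITEMS };

/*
 * Pick the "local" node and counter up front, then take a single
 * hit/miss branch instead of re-testing the flag afterwards.
 */
static void account(int zone_node, int preferred_node, int this_node,
                    int other_node, unsigned long stats[NR_ITEMS])
{
    int local_nid = this_node;
    int local_stat = NUMA_LOCAL;

    if (other_node) {    /* rare; the kernel annotates it unlikely() */
        local_stat = NUMA_OTHER;
        local_nid = preferred_node;
    }

    if (zone_node == local_nid) {
        stats[NUMA_HIT]++;
        stats[local_stat]++;
    } else {
        stats[NUMA_MISS]++;
    }
}

int main(void)
{
    unsigned long stats[NR_ITEMS] = { 0 };

    account(0, 0, 0, 0, stats);
    printf("hit=%lu local=%lu miss=%lu\n",
           stats[NUMA_HIT], stats[NUMA_LOCAL], stats[NUMA_MISS]);
    return 0;
}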
The performance difference on a page allocator microbenchmark is:

                        4.6.0-rc2             4.6.0-rc2
                 nocompound-v1r10      statbranch-v1r10
Min alloc-odr0-1 417.00 ( 0.00%) 419.00 ( -0.48%)
Min alloc-odr0-2 308.00 ( 0.00%) 305.00 ( 0.97%)
Min alloc-odr0-4 253.00 ( 0.00%) 250.00 ( 1.19%)
Min alloc-odr0-8 221.00 ( 0.00%) 219.00 ( 0.90%)
Min alloc-odr0-16 205.00 ( 0.00%) 203.00 ( 0.98%)
Min alloc-odr0-32 199.00 ( 0.00%) 195.00 ( 2.01%)
Min alloc-odr0-64 193.00 ( 0.00%) 191.00 ( 1.04%)
Min alloc-odr0-128 191.00 ( 0.00%) 189.00 ( 1.05%)
Min alloc-odr0-256 200.00 ( 0.00%) 198.00 ( 1.00%)
Min alloc-odr0-512 212.00 ( 0.00%) 210.00 ( 0.94%)
Min alloc-odr0-1024 219.00 ( 0.00%) 216.00 ( 1.37%)
Min alloc-odr0-2048 225.00 ( 0.00%) 221.00 ( 1.78%)
Min alloc-odr0-4096 231.00 ( 0.00%) 227.00 ( 1.73%)
Min alloc-odr0-8192 234.00 ( 0.00%) 232.00 ( 0.85%)
Min alloc-odr0-16384 234.00 ( 0.00%) 232.00 ( 0.85%)

Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmstat.c b/mm/vmstat.c index c831be3..d585de2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -581,17 +581,21 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) */ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { - if (z->zone_pgdat == preferred_zone->zone_pgdat) { + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); } else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } - if (z->node == ((flags & __GFP_OTHER_NODE) ? - preferred_zone->node : numa_node_id())) - __inc_zone_state(z, NUMA_LOCAL); - else - __inc_zone_state(z, NUMA_OTHER); } /* -- cgit v0.10.2 From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:27 -0700 Subject: mm, page_alloc: inline zone_statistics zone_statistics has one call-site but it's a public function. Make it static and inline.
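The shape of the change can be sketched in plain C: folding the #ifdef into a static inline lets the call site stay unconditional while the body compiles away when the option is off (the function name below is a stand-in):

#include <stdio.h>

/*
 * Sketch: a static inline with an internal #ifdef replaces both the
 * out-of-line function and its do-nothing macro stub, so the caller
 * needs no conditional compilation of its own.
 */
static inline void zone_statistics_sketch(int node)
{
#ifdef CONFIG_NUMA
    printf("account allocation on node %d\n", node);
#else
    (void)node;    /* compiles to nothing without NUMA */
#endif
}

int main(void)
{
    zone_statistics_sketch(0);    /* always valid to call */
    return 0;
}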
The performance difference on a page allocator microbenchmark is:

                        4.6.0-rc2             4.6.0-rc2
                 statbranch-v1r20      statinline-v1r20
Min alloc-odr0-1 419.00 ( 0.00%) 412.00 ( 1.67%)
Min alloc-odr0-2 305.00 ( 0.00%) 301.00 ( 1.31%)
Min alloc-odr0-4 250.00 ( 0.00%) 247.00 ( 1.20%)
Min alloc-odr0-8 219.00 ( 0.00%) 215.00 ( 1.83%)
Min alloc-odr0-16 203.00 ( 0.00%) 199.00 ( 1.97%)
Min alloc-odr0-32 195.00 ( 0.00%) 191.00 ( 2.05%)
Min alloc-odr0-64 191.00 ( 0.00%) 187.00 ( 2.09%)
Min alloc-odr0-128 189.00 ( 0.00%) 185.00 ( 2.12%)
Min alloc-odr0-256 198.00 ( 0.00%) 193.00 ( 2.53%)
Min alloc-odr0-512 210.00 ( 0.00%) 207.00 ( 1.43%)
Min alloc-odr0-1024 216.00 ( 0.00%) 213.00 ( 1.39%)
Min alloc-odr0-2048 221.00 ( 0.00%) 220.00 ( 0.45%)
Min alloc-odr0-4096 227.00 ( 0.00%) 226.00 ( 0.44%)
Min alloc-odr0-8192 232.00 ( 0.00%) 229.00 ( 1.29%)
Min alloc-odr0-16384 232.00 ( 0.00%) 229.00 ( 1.29%)

Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 02fce41..d2da8e0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_NUMA extern unsigned long node_page_state(int node, enum zone_stat_item item); -extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else #define node_page_state(node, item) global_page_state(item) -#define zone_statistics(_zl, _z, gfp) do { } while (0) #endif /* CONFIG_NUMA */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7be1ce8..36384ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2355,6 +2355,37 @@ int split_free_page(struct page *page) } /* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + +/* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline diff --git a/mm/vmstat.c b/mm/vmstat.c index d585de2..f1a73bf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -570,35 +570,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #ifdef CONFIG_NUMA /* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes.
- */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - int local_nid = numa_node_id(); - enum zone_stat_item local_stat = NUMA_LOCAL; - - if (unlikely(flags & __GFP_OTHER_NODE)) { - local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - - if (z->node == local_nid) { - __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } -} - -/* * Determine the per node value of a stat item. */ unsigned long node_page_state(int node, enum zone_stat_item item) -- cgit v0.10.2 From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:30 -0700 Subject: mm, page_alloc: inline the fast path of the zonelist iterator The page allocator iterates through a zonelist for zones that match the addressing limitations and nodemask of the caller but many allocations will not be restricted. Despite this, there is always functional call overhead which builds up. This patch inlines the optimistic basic case and only calls the iterator function for the complex case. A hindrance was the fact that cpuset_current_mems_allowed is used in the fastpath as the allowed nodemask even though all nodes are allowed on most systems. The patch handles this by only considering cpuset_current_mems_allowed if a cpuset exists. As well as being faster in the fast-path, this removes some junk in the slowpath. The performance difference on a page allocator microbenchmark is:

                        4.6.0-rc2             4.6.0-rc2
                 statinline-v1r20         optiter-v1r20
Min alloc-odr0-1 412.00 ( 0.00%) 382.00 ( 7.28%)
Min alloc-odr0-2 301.00 ( 0.00%) 282.00 ( 6.31%)
Min alloc-odr0-4 247.00 ( 0.00%) 233.00 ( 5.67%)
Min alloc-odr0-8 215.00 ( 0.00%) 203.00 ( 5.58%)
Min alloc-odr0-16 199.00 ( 0.00%) 188.00 ( 5.53%)
Min alloc-odr0-32 191.00 ( 0.00%) 182.00 ( 4.71%)
Min alloc-odr0-64 187.00 ( 0.00%) 177.00 ( 5.35%)
Min alloc-odr0-128 185.00 ( 0.00%) 175.00 ( 5.41%)
Min alloc-odr0-256 193.00 ( 0.00%) 184.00 ( 4.66%)
Min alloc-odr0-512 207.00 ( 0.00%) 197.00 ( 4.83%)
Min alloc-odr0-1024 213.00 ( 0.00%) 203.00 ( 4.69%)
Min alloc-odr0-2048 220.00 ( 0.00%) 209.00 ( 5.00%)
Min alloc-odr0-4096 226.00 ( 0.00%) 214.00 ( 5.31%)
Min alloc-odr0-8192 229.00 ( 0.00%) 218.00 ( 4.80%)
Min alloc-odr0-16384 229.00 ( 0.00%) 219.00 ( 4.37%)

perf indicated that next_zones_zonelist disappeared in the profile and __next_zones_zonelist did not appear. This is expected as the micro-benchmark would hit the inlined fast-path every time. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 150c604..cfcd772 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } +struct zoneref *__next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes); + /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search @@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * being examined. It should be advanced by one before calling * next_zones_zonelist again.
*/ -struct zoneref *next_zones_zonelist(struct zoneref *z, +static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes); + nodemask_t *nodes) +{ + if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) + return z; + return __next_zones_zonelist(z, highest_zoneidx, nodes); +} /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb..5652be8 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36384ba..789e5f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3192,17 +3192,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. - */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); + ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) goto out; ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); -- cgit v0.10.2 From 754078eb45df8069f389f3371002e7e24962e1a2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:33 -0700 Subject: mm, page_alloc: use __dec_zone_state for order-0 page allocation __dec_zone_state is cheaper to use for removing an order-0 page as it has fewer conditions to check. 
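In userspace terms, the special case looks roughly like this; the counter name and the order-based split are illustrative:

#include <stdio.h>

static long nr_alloc_batch;

/* Generic path: handles any delta, so it carries more logic. */
static void mod_state(long delta)
{
    nr_alloc_batch += delta;
}

/* Dedicated decrement: the delta is known to be -1 up front. */
static void dec_state(void)
{
    nr_alloc_batch--;
}

int main(void)
{
    unsigned int order = 0;

    if (order == 0)
        dec_state();                  /* order-0 fast path */
    else
        mod_state(-(1L << order));    /* high-order path */

    printf("batch counter: %ld\n", nr_alloc_batch);
    return 0;
}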
The performance difference on a page allocator microbenchmark is:

                        4.6.0-rc2             4.6.0-rc2
                    optiter-v1r20         decstat-v1r20
Min alloc-odr0-1 382.00 ( 0.00%) 381.00 ( 0.26%)
Min alloc-odr0-2 282.00 ( 0.00%) 275.00 ( 2.48%)
Min alloc-odr0-4 233.00 ( 0.00%) 229.00 ( 1.72%)
Min alloc-odr0-8 203.00 ( 0.00%) 199.00 ( 1.97%)
Min alloc-odr0-16 188.00 ( 0.00%) 186.00 ( 1.06%)
Min alloc-odr0-32 182.00 ( 0.00%) 179.00 ( 1.65%)
Min alloc-odr0-64 177.00 ( 0.00%) 174.00 ( 1.69%)
Min alloc-odr0-128 175.00 ( 0.00%) 172.00 ( 1.71%)
Min alloc-odr0-256 184.00 ( 0.00%) 181.00 ( 1.63%)
Min alloc-odr0-512 197.00 ( 0.00%) 193.00 ( 2.03%)
Min alloc-odr0-1024 203.00 ( 0.00%) 201.00 ( 0.99%)
Min alloc-odr0-2048 209.00 ( 0.00%) 206.00 ( 1.44%)
Min alloc-odr0-4096 214.00 ( 0.00%) 212.00 ( 0.93%)
Min alloc-odr0-8192 218.00 ( 0.00%) 215.00 ( 1.38%)
Min alloc-odr0-16384 219.00 ( 0.00%) 216.00 ( 1.37%)

Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 789e5f0..8f6b6ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2417,6 +2417,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, else page = list_first_entry(list, struct page, lru); + __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; } else { @@ -2438,11 +2439,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) set_bit(ZONE_FAIR_DEPLETED, &zone->flags); -- cgit v0.10.2 From f75fb889d18d362e336f8d3fba158a8636d0a063 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:36 -0700 Subject: mm, page_alloc: avoid unnecessary zone lookups during pageblock operations Pageblocks have an associated bitmap to store migrate types and whether the pageblock should be skipped during compaction. The bitmap may be associated with a memory section or a zone but the zone is looked up unconditionally. The compiler should optimise this away automatically so this is a cosmetic patch only in many cases.
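The bit-index computation itself is small; a userspace sketch of the non-SPARSEMEM case follows, where the constants are illustrative rather than the kernel's configuration:

#include <stdio.h>

#define PAGEBLOCK_ORDER     9    /* pages per pageblock = 1 << 9 */
#define NR_PAGEBLOCK_BITS   4    /* bits of state per pageblock */

/*
 * Map a pfn to the first bit of its pageblock's state. Only the zone's
 * starting pfn is needed, and the page itself is enough to find the
 * zone, so no separate zone argument is required.
 */
static unsigned long pfn_to_bitidx(unsigned long zone_start_pfn,
                                   unsigned long pfn)
{
    pfn -= zone_start_pfn;    /* offset of the pfn within the zone */
    return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

int main(void)
{
    /* pfn 0x1a00 in a zone starting at 0x1000: pageblock 5, bit 20 */
    printf("bitidx: %lu\n", pfn_to_bitidx(0x1000, 0x1a00));
    return 0;
}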
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f6b6ea..7f328cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6767,23 +6767,23 @@ void *__init alloc_large_system_hash(const char *tablename, } /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct zone *zone, +static inline unsigned long *get_pageblock_bitmap(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM return __pfn_to_section(pfn)->pageblock_flags; #else - return zone->pageblock_flags; + return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -6801,14 +6801,12 @@ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { - struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long word; - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); @@ -6830,20 +6828,18 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long end_bitidx, unsigned long mask) { - struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitidx += end_bitidx; mask <<= (BITS_PER_LONG - bitidx - 1); -- cgit v0.10.2 From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:38 -0700 Subject: mm, page_alloc: convert alloc_flags to unsigned alloc_flags is a bitmask of flags but it is signed which does not necessarily generate the best code depending on the compiler. Even without an impact, it makes more sense that this be unsigned. 
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d7c8de5..242b660 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended); + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cfcd772..327f0fa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx, + unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); enum memmap_context { diff --git a/mm/compaction.c b/mm/compaction.c index 7487067..8f339ca 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_CONTINUE - If compaction should run now */ static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { int fragindex; unsigned long watermark; @@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, } unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { unsigned long ret; @@ -1530,7 +1532,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500; * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; diff --git a/mm/internal.h b/mm/internal.h index 098a89e..114593a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,7 +175,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... 
*/ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f328cf..094587a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned) } static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) + unsigned int alloc_flags) { int i; bool poisoned = true; @@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, + unsigned long mark, int classzone_idx, + unsigned int alloc_flags, long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); @@ -2957,7 +2959,7 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags 
= ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; + unsigned int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; @@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), -- cgit v0.10.2 From fa379b9586c7507c607d031dadf3681ed29614a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:41 -0700 Subject: mm, page_alloc: convert nr_fair_skipped to bool The number of zones skipped due to a zone expiring its fair zone allocation quota is irrelevant. Convert to bool. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 094587a..41d20bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2680,7 +2680,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - int nr_fair_skipped = 0; + bool fair_skipped; bool zonelist_rescan; zonelist_scan: @@ -2708,7 +2708,7 @@ zonelist_scan: if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - nr_fair_skipped++; + fair_skipped = true; continue; } } @@ -2801,7 +2801,7 @@ try_this_zone: */ if (alloc_flags & ALLOC_FAIR) { alloc_flags &= ~ALLOC_FAIR; - if (nr_fair_skipped) { + if (fair_skipped) { zonelist_rescan = true; reset_alloc_batches(ac->preferred_zone); } -- cgit v0.10.2 From 4dfa6cd8fdb1682586d463bc34888980fe98eb46 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:44 -0700 Subject: mm, page_alloc: remove unnecessary local variable in get_page_from_freelist zonelist here is a copy of a struct field that is used once. Ditch it. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41d20bf..1d70580 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2676,7 +2676,6 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; @@ -2690,7 +2689,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/ - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { unsigned long mark; -- cgit v0.10.2 From be06af002f6d50de10fd602ce3a6aa5d28e88d38 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:47 -0700 Subject: mm, page_alloc: remove unnecessary initialisation in get_page_from_freelist Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1d70580..1096ac8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2677,7 +2677,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { struct zoneref *z; - struct page *page = NULL; struct zone *zone; bool fair_skipped; bool zonelist_rescan; @@ -2691,6 +2690,7 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { + struct page *page; unsigned long mark; if (cpusets_enabled() && -- cgit v0.10.2 From 5bb1b169757875a72e05bfcbb76e22602cb1a760 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:50 -0700 Subject: mm, page_alloc: remove unnecessary initialisation from __alloc_pages_nodemask() page is guaranteed to be set before it is read with or without the initialisation. [akpm@linux-foundation.org: fix warning] Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1096ac8..f9ca6cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3347,7 +3347,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *preferred_zoneref; - struct page *page = NULL; + struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ @@ -3393,8 +3393,11 @@ retry_cpuset: /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) + if (!ac.preferred_zone) { + page = NULL; goto out; + } + ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ -- cgit v0.10.2 From 09940a4f1e816abe3248fa0d185fc0e7f54c8c12 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:53 -0700 Subject: mm, page_alloc: simplify last cpupid reset The current reset unnecessarily clears flags and makes pointless calculations. 
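The equivalence behind the simplification can be checked in a few lines of userspace C; the field-layout constants here are invented for the sketch:

#include <stdio.h>

#define CPUPID_SHIFT    8
#define CPUPID_PGSHIFT  16
#define CPUPID_MASK     ((1ul << CPUPID_SHIFT) - 1)

int main(void)
{
    unsigned long old_way = 0xdeadul;
    unsigned long new_way = 0xdeadul;
    unsigned long cpupid = (1ul << CPUPID_SHIFT) - 1;    /* all ones */

    /* old reset: clear the field, then insert the all-ones value */
    old_way &= ~(CPUPID_MASK << CPUPID_PGSHIFT);
    old_way |= (cpupid & CPUPID_MASK) << CPUPID_PGSHIFT;

    /* new reset: the value written is all ones, so one OR suffices */
    new_way |= CPUPID_MASK << CPUPID_PGSHIFT;

    printf("%s\n", old_way == new_way ? "identical" : "differ");
    return 0;
}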
Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2852c..2b97be1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; - - page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); - page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ #else /* !CONFIG_NUMA_BALANCING */ -- cgit v0.10.2 From 83d4ca8148fd9092715fd8ef75b30bbfd67fd2a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:56 -0700 Subject: mm, page_alloc: move __GFP_HARDWALL modifications out of the fastpath __GFP_HARDWALL only has meaning in the context of cpusets but the fast path always applies the flag on the first attempt. Move the manipulations into the cpuset paths where they will be masked by a static branch in the common case. With the other micro-optimisations in this series combined, the impact on a page allocator microbenchmark is 4.6.0-rc2 4.6.0-rc2 decstat-v1r20 micro-v1r20 Min alloc-odr0-1 381.00 ( 0.00%) 377.00 ( 1.05%) Min alloc-odr0-2 275.00 ( 0.00%) 273.00 ( 0.73%) Min alloc-odr0-4 229.00 ( 0.00%) 226.00 ( 1.31%) Min alloc-odr0-8 199.00 ( 0.00%) 196.00 ( 1.51%) Min alloc-odr0-16 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-32 179.00 ( 0.00%) 175.00 ( 2.23%) Min alloc-odr0-64 174.00 ( 0.00%) 172.00 ( 1.15%) Min alloc-odr0-128 172.00 ( 0.00%) 170.00 ( 1.16%) Min alloc-odr0-256 181.00 ( 0.00%) 183.00 ( -1.10%) Min alloc-odr0-512 193.00 ( 0.00%) 191.00 ( 1.04%) Min alloc-odr0-1024 201.00 ( 0.00%) 199.00 ( 1.00%) Min alloc-odr0-2048 206.00 ( 0.00%) 204.00 ( 0.97%) Min alloc-odr0-4096 212.00 ( 0.00%) 210.00 ( 0.94%) Min alloc-odr0-8192 215.00 ( 0.00%) 213.00 ( 0.93%) Min alloc-odr0-16384 216.00 ( 0.00%) 214.00 ( 0.93%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9ca6cc..48afc1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3350,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), .zonelist = zonelist, @@ -3359,6 +3359,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, }; if (cpusets_enabled()) { + alloc_mask |= __GFP_HARDWALL; alloc_flags |= ALLOC_CPUSET; if (!ac.nodemask) ac.nodemask = &cpuset_current_mems_allowed; @@ -3401,7 +3402,6 @@ retry_cpuset: ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ - alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (unlikely(!page)) { /* @@ -3427,8 +3427,10 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. 
*/ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { + alloc_mask = gfp_mask; goto retry_cpuset; + } return page; } -- cgit v0.10.2 From 3777999dd47ec00ec34a151b1d93c0a2b721e822 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:58 -0700 Subject: mm, page_alloc: check once if a zone has isolated pageblocks When bulk freeing pages from the per-cpu lists the zone is checked for isolated pageblocks on every release. This patch checks it once per drain. [mgorman@techsingularity.net: fix locking race, per Vlastimil] Signed-off-by: Mel Gorman Signed-off-by: Vlastimil Babka Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48afc1a..a3b7eb8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -831,8 +831,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, int batch_free = 0; int to_free = count; unsigned long nr_scanned; + bool isolated_pageblocks; spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); @@ -870,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ - if (unlikely(has_isolate_pageblock(zone))) + if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); __free_one_page(page, page_to_pfn(page), zone, 0, mt); -- cgit v0.10.2 From 4fcb0971175f6037590d7b7772fe6619261d2165 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:01 -0700 Subject: mm, page_alloc: shorten the page allocator fast path The page allocator fast path checks the page multiple times unnecessarily. This patch avoids all the slowpath checks if the first allocation attempt succeeds. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3b7eb8..8380011 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3398,31 +3398,26 @@ retry_cpuset: ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) { page = NULL; - goto out; + goto no_zone; } ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); - if (unlikely(!page)) { - /* - * Runtime PM, block IO and its error handling path - * can deadlock because I/O on the device might not - * complete. - */ - alloc_mask = memalloc_noio_flags(gfp_mask); - ac.spread_dirty_pages = false; - - page = __alloc_pages_slowpath(alloc_mask, order, &ac); - } + if (likely(page)) + goto out; - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); + /* + * Runtime PM, block IO and its error handling path can deadlock + * because I/O on the device might not complete.
+ */ + alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + page = __alloc_pages_slowpath(alloc_mask, order, &ac); -out: +no_zone: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while @@ -3434,6 +3429,12 @@ out: goto retry_cpuset; } +out: + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); -- cgit v0.10.2 From 305347550becd08fdb576df32fc0767842ed71a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:04 -0700 Subject: mm, page_alloc: reduce cost of fair zone allocation policy retry The fair zone allocation policy is not without cost but that cost can be reduced slightly. This patch removes an unnecessary local variable, checks the likely conditions of the fair zone policy first, uses a bool instead of a flags check and falls through when a remote node is encountered instead of doing a full restart. The benefit is marginal but it's there: 4.6.0-rc2 4.6.0-rc2 decstat-v1r20 optfair-v1r20 Min alloc-odr0-1 377.00 ( 0.00%) 380.00 ( -0.80%) Min alloc-odr0-2 273.00 ( 0.00%) 273.00 ( 0.00%) Min alloc-odr0-4 226.00 ( 0.00%) 227.00 ( -0.44%) Min alloc-odr0-8 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-16 183.00 ( 0.00%) 183.00 ( 0.00%) Min alloc-odr0-32 175.00 ( 0.00%) 173.00 ( 1.14%) Min alloc-odr0-64 172.00 ( 0.00%) 169.00 ( 1.74%) Min alloc-odr0-128 170.00 ( 0.00%) 169.00 ( 0.59%) Min alloc-odr0-256 183.00 ( 0.00%) 180.00 ( 1.64%) Min alloc-odr0-512 191.00 ( 0.00%) 190.00 ( 0.52%) Min alloc-odr0-1024 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-2048 204.00 ( 0.00%) 204.00 ( 0.00%) Min alloc-odr0-4096 210.00 ( 0.00%) 209.00 ( 0.48%) Min alloc-odr0-8192 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-16384 214.00 ( 0.00%) 214.00 ( 0.00%) The benefit is marginal at best, but one of the most important gains, avoiding a second search when falling back to another node, is not triggered by this particular test, so the improvement in some corner cases is understated. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8380011..a412493 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2680,12 +2680,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z; struct zone *zone; - bool fair_skipped; - bool zonelist_rescan; + bool fair_skipped = false; + bool apply_fair = (alloc_flags & ALLOC_FAIR); zonelist_scan: - zonelist_rescan = false; - /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2705,13 +2703,16 @@ zonelist_scan: * page was allocated in should have no effect on the * time the page has in memory before being reclaimed.
*/ - if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(ac->preferred_zone, zone)) - break; + if (apply_fair) { if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { fair_skipped = true; continue; } + if (!zone_local(ac->preferred_zone, zone)) { + if (fair_skipped) + goto reset_fair; + apply_fair = false; + } } /* * When allocating a page cache page for writing, we @@ -2800,18 +2801,13 @@ try_this_zone: * include remote zones now, before entering the slowpath and waking * kswapd: prefer spilling to a remote zone over swapping locally. */ - if (alloc_flags & ALLOC_FAIR) { - alloc_flags &= ~ALLOC_FAIR; - if (fair_skipped) { - zonelist_rescan = true; - reset_alloc_batches(ac->preferred_zone); - } - if (nr_online_nodes > 1) - zonelist_rescan = true; - } - - if (zonelist_rescan) + if (fair_skipped) { +reset_fair: + apply_fair = false; + fair_skipped = false; + reset_alloc_batches(ac->preferred_zone); goto zonelist_scan; + } return NULL; } -- cgit v0.10.2 From 48ee5f3696f62496481a8b6d852bcad9b3ebbe37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:07 -0700 Subject: mm, page_alloc: shortcut watermark checks for order-0 pages Watermarks have to be checked on every allocation including the number of pages being allocated and whether reserves can be accessed. The reserves only matter if memory is limited and the free_pages adjustment only applies to high-order pages. This patch adds a shortcut for order-0 pages that avoids numerous calculations if there is plenty of free memory yielding the following performance difference in a page allocator microbenchmark; 4.6.0-rc2 4.6.0-rc2 optfair-v1r20 fastmark-v1r20 Min alloc-odr0-1 380.00 ( 0.00%) 364.00 ( 4.21%) Min alloc-odr0-2 273.00 ( 0.00%) 262.00 ( 4.03%) Min alloc-odr0-4 227.00 ( 0.00%) 214.00 ( 5.73%) Min alloc-odr0-8 196.00 ( 0.00%) 186.00 ( 5.10%) Min alloc-odr0-16 183.00 ( 0.00%) 173.00 ( 5.46%) Min alloc-odr0-32 173.00 ( 0.00%) 165.00 ( 4.62%) Min alloc-odr0-64 169.00 ( 0.00%) 161.00 ( 4.73%) Min alloc-odr0-128 169.00 ( 0.00%) 159.00 ( 5.92%) Min alloc-odr0-256 180.00 ( 0.00%) 168.00 ( 6.67%) Min alloc-odr0-512 190.00 ( 0.00%) 180.00 ( 5.26%) Min alloc-odr0-1024 198.00 ( 0.00%) 190.00 ( 4.04%) Min alloc-odr0-2048 204.00 ( 0.00%) 196.00 ( 3.92%) Min alloc-odr0-4096 209.00 ( 0.00%) 202.00 ( 3.35%) Min alloc-odr0-8192 213.00 ( 0.00%) 206.00 ( 3.29%) Min alloc-odr0-16384 214.00 ( 0.00%) 206.00 ( 3.74%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a412493..732875b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2623,6 +2623,32 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, zone_page_state(z, NR_FREE_PAGES)); } +static inline bool zone_watermark_fast(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, unsigned int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + /* + * Fast check for order-0 only. If this fails then the reserves + * need to be calculated. There is a corner case where the check + * passes but only the high-order atomic reserve are free. If + * the caller is !atomic then it'll uselessly search the free + * list. That corner case is then slower but it is harmless. 
+ */ + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + return true; + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx) { @@ -2744,7 +2770,7 @@ zonelist_scan: continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, + if (!zone_watermark_fast(zone, order, mark, ac->classzone_idx, alloc_flags)) { int ret; -- cgit v0.10.2 From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:10 -0700 Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice The allocator fast path looks up the first usable zone in a zonelist and then get_page_from_freelist does the same job in the zonelist iterator. This patch preserves the necessary information. 4.6.0-rc2 4.6.0-rc2 fastmark-v1r20 initonce-v1r20 Min alloc-odr0-1 364.00 ( 0.00%) 359.00 ( 1.37%) Min alloc-odr0-2 262.00 ( 0.00%) 260.00 ( 0.76%) Min alloc-odr0-4 214.00 ( 0.00%) 214.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 165.00 ( 0.00%) Min alloc-odr0-64 161.00 ( 0.00%) 162.00 ( -0.62%) Min alloc-odr0-128 159.00 ( 0.00%) 161.00 ( -1.26%) Min alloc-odr0-256 168.00 ( 0.00%) 170.00 ( -1.19%) Min alloc-odr0-512 180.00 ( 0.00%) 181.00 ( -0.56%) Min alloc-odr0-1024 190.00 ( 0.00%) 190.00 ( 0.00%) Min alloc-odr0-2048 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-4096 202.00 ( 0.00%) 202.00 ( 0.00%) Min alloc-odr0-8192 206.00 ( 0.00%) 205.00 ( 0.49%) Min alloc-odr0-16384 206.00 ( 0.00%) 205.00 ( 0.49%) The benefit is negligible and the results are within the noise but each cycle counts. 
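The underlying pattern, returning a cursor from the "first element" lookup instead of filling an out-parameter so that callers can cache the cursor and resume iteration without repeating the search, can be shown in plain userspace C. The types below are invented and are not the kernel API:

#include <stddef.h>
#include <stdio.h>

struct zoneref {
        const char *zone;       /* NULL terminates the list */
        int zone_idx;
};

/* Analogue of next_zones_zonelist(): advance to the next usable entry. */
static struct zoneref *next_usable(struct zoneref *z, int highest_idx)
{
        while (z->zone && z->zone_idx > highest_idx)
                z++;
        return z;
}

/* Analogue of first_zones_zonelist(): returns a cursor, no out-parameter. */
static struct zoneref *first_usable(struct zoneref *list, int highest_idx)
{
        return next_usable(list, highest_idx);
}

int main(void)
{
        struct zoneref zonelist[] = {
                { "Movable", 3 }, { "Normal", 2 }, { "DMA32", 1 }, { NULL, 0 },
        };
        /* The fast path performs the lookup once... */
        struct zoneref *preferred = first_usable(zonelist, 2);

        /* ...and later scans resume from the cached cursor instead of
         * searching from the start again, which is the idea behind
         * for_next_zone_zonelist_nodemask(). */
        for (struct zoneref *z = preferred; z->zone;
             z = next_usable(z + 1, 2))
                printf("considering zone %s\n", z->zone);
        return 0;
}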
Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a8..754813a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -255,17 +255,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 327f0fa..4b28d2f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { - struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); - *zone = zonelist_zone(z); - return z; } /** @@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ - for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ - zone = zonelist_zone(z)) \ + zone = zonelist_zone(z)) + +#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (zone = z->zone; \ + zone; \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) + /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/internal.h b/mm/internal.h index 114593a..d1ddd71 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; + struct zoneref *preferred_zoneref; int classzone_idx; int migratetype; enum zone_type high_zoneidx; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f80ebc..297d685 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? 
z->zone->node : node; } default: @@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. * use current page if in policy nodemask, @@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 732875b..dba8cfd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2704,7 +2704,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; bool fair_skipped = false; bool apply_fair = (alloc_flags & ALLOC_FAIR); @@ -2714,7 +2714,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -2734,7 +2734,7 @@ zonelist_scan: fair_skipped = true; continue; } - if (!zone_local(ac->preferred_zone, zone)) { + if (!zone_local(ac->preferred_zoneref->zone, zone)) { if (fair_skipped) goto reset_fair; apply_fair = false; @@ -2780,7 +2780,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2802,7 +2802,7 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) @@ -2831,7 +2831,7 @@ try_this_zone: reset_fair: apply_fair = false; fair_skipped = false; - reset_alloc_batches(ac->preferred_zone); + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; } @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); } static inline unsigned int @@ -3332,7 +3332,7 @@ retry: if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); goto retry; } @@ -3370,7 +3370,6 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t 
*nodemask) { - struct zoneref *preferred_zoneref; struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; @@ -3416,14 +3415,14 @@ retry_cpuset: ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) { + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { page = NULL; goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif -- cgit v0.10.2 From 93ea9964d14ad583492ffb9ab7543f015876aaf2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:13 -0700 Subject: mm, page_alloc: remove field from alloc_context The classzone_idx can be inferred from preferred_zoneref so remove the unnecessary field and save stack space. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 8f339ca..eda3c22 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1602,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, - ac->classzone_idx); + ac_classzone_idx(ac)); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1612,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index d1ddd71..3ac544f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -103,12 +103,13 @@ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; - int classzone_idx; int migratetype; enum zone_type high_zoneidx; bool spread_dirty_pages; }; +#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dba8cfd..9da66e7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2771,7 +2771,7 @@ zonelist_scan: mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2794,7 +2794,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) + ac_classzone_idx(ac), alloc_flags)) goto try_this_zone; continue; @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); + wakeup_kswapd(zone, order, ac_classzone_idx(ac)); } static inline unsigned int @@ -3422,8 +3422,6 @@ retry_cpuset: goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); - /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) -- cgit v0.10.2 From 7bfec6f47bb0ffd207c7e813e819235e6c1c0f34 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:15 -0700 Subject: mm, page_alloc: check multiple page fields with a single branch Every page allocated or freed is checked for sanity to avoid corruptions that are difficult to detect later. A bad page could be due to a number of fields. Instead of using multiple branches, this patch combines multiple fields into a single branch. A detailed check is only necessary if that check fails. 4.6.0-rc2 4.6.0-rc2 initonce-v1r20 multcheck-v1r20 Min alloc-odr0-1 359.00 ( 0.00%) 348.00 ( 3.06%) Min alloc-odr0-2 260.00 ( 0.00%) 254.00 ( 2.31%) Min alloc-odr0-4 214.00 ( 0.00%) 213.00 ( 0.47%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 166.00 ( -0.61%) Min alloc-odr0-64 162.00 ( 0.00%) 162.00 ( 0.00%) Min alloc-odr0-128 161.00 ( 0.00%) 160.00 ( 0.62%) Min alloc-odr0-256 170.00 ( 0.00%) 169.00 ( 0.59%) Min alloc-odr0-512 181.00 ( 0.00%) 180.00 ( 0.55%) Min alloc-odr0-1024 190.00 ( 0.00%) 188.00 ( 1.05%) Min alloc-odr0-2048 196.00 ( 0.00%) 194.00 ( 1.02%) Min alloc-odr0-4096 202.00 ( 0.00%) 199.00 ( 1.49%) Min alloc-odr0-8192 205.00 ( 0.00%) 202.00 ( 1.46%) Min alloc-odr0-16384 205.00 ( 0.00%) 203.00 ( 0.98%) Again, the benefit is marginal but avoiding excessive branches is important. Ideally the paths would not have to check these conditions at all but regrettably abandoning the tests would make use-after-free bugs much harder to detect. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9da66e7..76a3948 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -784,10 +784,42 @@ out: zone->free_area[order].nr_free++; } +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary. 
+ */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) +{ + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + (unsigned long)page->mem_cgroup | +#endif + (page->flags & check_flags))) + return false; + + return true; +} + static inline int free_pages_check(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + const char *bad_reason; + unsigned long bad_flags; + + if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) { + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; + } + + /* Something has gone sideways, find it */ + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -803,14 +835,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - page_cpupid_reset_last(page); - if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; + bad_page(page, bad_reason, bad_flags); + return 1; } /* @@ -1492,9 +1518,14 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + const char *bad_reason; + unsigned long bad_flags; + + if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)) + return 0; + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) -- cgit v0.10.2 From bb552ac6c6b4f24e7a7b491286f87b63f9478d42 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:18 -0700 Subject: mm, page_alloc: un-inline the bad part of free_pages_check From: Vlastimil Babka !DEBUG_VM size and bloat-o-meter: add/remove: 1/0 grow/shrink: 0/2 up/down: 124/-370 (-246) function old new delta free_pages_check_bad - 124 +124 free_pcppages_bulk 1288 1171 -117 __free_pages_ok 948 695 -253 DEBUG_VM: add/remove: 1/0 grow/shrink: 0/1 up/down: 124/-214 (-90) function old new delta free_pages_check_bad - 124 +124 free_pages_prepare 1112 898 -214 [akpm@linux-foundation.org: fix whitespace] Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76a3948..d51543d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -806,18 +806,11 @@ static inline bool page_expected_state(struct page *page, return true; } -static inline int free_pages_check(struct page *page) +static void free_pages_check_bad(struct page *page) { const char *bad_reason; unsigned long bad_flags; - if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) { - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; - } - - /* Something has gone sideways, find it */ bad_reason = NULL; bad_flags = 0; @@ -836,6 +829,18 @@ static inline int free_pages_check(struct page *page) bad_reason = "page still charged to cgroup"; #endif bad_page(page, bad_reason, bad_flags); +} + +static inline int free_pages_check(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) { + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 
+ return 0; + } + + /* Something has gone sideways, find it */ + free_pages_check_bad(page); return 1; } -- cgit v0.10.2 From da838d4fcba675cbf864f225d76f970e91220ee6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:21 -0700 Subject: mm, page_alloc: pull out side effects from free_pages_check Check without side-effects should be easier to maintain. It also removes the duplicated cpupid and flags reset done in !DEBUG_VM variant of both free_pcp_prepare() and then bulkfree_pcp_prepare(). Finally, it enables the next patch. It shouldn't result in new branches, thanks to inlining of the check. !DEBUG_VM bloat-o-meter: add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-27 (-27) function old new delta __free_pages_ok 748 739 -9 free_pcppages_bulk 1403 1385 -18 DEBUG_VM: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-28 (-28) function old new delta free_pages_prepare 806 778 -28 This is also slightly faster because cpupid information is not set on tail pages so we can avoid resets there. Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d51543d..fea50b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -833,11 +833,8 @@ static void free_pages_check_bad(struct page *page) static inline int free_pages_check(struct page *page) { - if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) { - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; - } /* Something has gone sideways, find it */ free_pages_check_bad(page); @@ -1078,7 +1075,11 @@ static bool free_pages_prepare(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (PageAnonHead(page)) @@ -1087,6 +1088,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) if (bad) return false; + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); if (!PageHighMem(page)) { -- cgit v0.10.2 From e5b31ac2ca2cd0cf6bf2fcbb708ed01466c89aaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:24 -0700 Subject: mm, page_alloc: remove unnecessary variable from free_pcppages_bulk The original count is never reused so it can be removed. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fea50b0..822ce86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -857,7 +857,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int to_free = count; unsigned long nr_scanned; bool isolated_pageblocks; @@ -867,7 +866,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - while (to_free) { + while (count) { struct page *page; struct list_head *list; @@ -887,7 +886,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* This is the only non-empty list. Free them all. 
*/ if (batch_free == MIGRATE_PCPTYPES) - batch_free = to_free; + batch_free = count; do { int mt; /* migratetype of the to-be-freed page */ @@ -905,7 +904,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - } while (--to_free && --batch_free && !list_empty(list)); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } -- cgit v0.10.2 From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:27 -0700 Subject: mm, page_alloc: inline pageblock lookup in page free fast paths The function call overhead of get_pfnblock_flags_mask() is measurable in the page free paths. This patch uses an inlined version that is faster. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b28d2f..c60db20 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled; get_pfnblock_flags_mask(page, page_to_pfn(page), \ PB_migrate_end, MIGRATETYPE_MASK) -static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) -{ - BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, - MIGRATETYPE_MASK); -} - struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822ce86..bdf7a13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static 
__always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return page_zone(page)->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long 
old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages diff --git a/mm/page_owner.c b/mm/page_owner.c index 438768c..792b56d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", diff --git a/mm/vmstat.c b/mm/vmstat.c index f1a73bf..5b72a8a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) -- cgit v0.10.2 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact that, if there is a single root CPU set, allocation must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way, so the branching overhead that jump labels are meant to eliminate remains. This patch converts it so that the static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and the key was enabled. This is needed for get_page_from_freelist() where we want to also avoid the relatively slower check when ALLOC_CPUSET is not set in alloc_flags. The impact on the page allocator microbenchmark is less than expected but the cleanup in itself is worthwhile.
4.6.0-rc2 4.6.0-rc2 multcheck-v1r20 cpuset-v1r20 Min alloc-odr0-1 348.00 ( 0.00%) 348.00 ( 0.00%) Min alloc-odr0-2 254.00 ( 0.00%) 254.00 ( 0.00%) Min alloc-odr0-4 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-16 173.00 ( 0.00%) 171.00 ( 1.16%) Min alloc-odr0-32 166.00 ( 0.00%) 163.00 ( 1.81%) Min alloc-odr0-64 162.00 ( 0.00%) 159.00 ( 1.85%) Min alloc-odr0-128 160.00 ( 0.00%) 157.00 ( 1.88%) Min alloc-odr0-256 169.00 ( 0.00%) 166.00 ( 1.78%) Min alloc-odr0-512 180.00 ( 0.00%) 180.00 ( 0.00%) Min alloc-odr0-1024 188.00 ( 0.00%) 187.00 ( 0.53%) Min alloc-odr0-2048 194.00 ( 0.00%) 193.00 ( 0.52%) Min alloc-odr0-4096 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-8192 202.00 ( 0.00%) 201.00 ( 0.50%) Min alloc-odr0-16384 203.00 ( 0.00%) 202.00 ( 0.49%) Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Acked-by: Zefan Li Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868c..bfc204e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,26 +16,26 @@ #ifdef CONFIG_CPUSETS -extern struct static_key cpusets_enabled_key; +extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { - return static_key_false(&cpusets_enabled_key); + return static_branch_unlikely(&cpusets_enabled_key); } static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key) + 1; + return static_key_count(&cpusets_enabled_key.key) + 1; } static inline void cpuset_inc(void) { - static_key_slow_inc(&cpusets_enabled_key); + static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_key_slow_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_enabled_key); } extern int cpuset_init(void); @@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask); -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask); + if (cpusets_enabled()) + return __cpuset_node_allowed(node, gfp_mask); + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return __cpuset_node_allowed(zone_to_nid(z), gfp_mask); +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + if (cpusets_enabled()) + return __cpuset_zone_allowed(z, gfp_mask); + return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return 1; + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return 1; + return true; +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return true; } 
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69..73e93e5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf7a13..39c441b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2859,7 +2859,7 @@ zonelist_scan: if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual -- cgit v0.10.2 From 4db7548ccbd9ec8e666f35df4a530f55904dec39 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:32 -0700 Subject: mm, page_alloc: defer debugging checks of freed pages until a PCP drain Every page free checks a number of page fields for validity. This catches premature frees and corruptions but it is also expensive. This patch weakens the debugging check by checking PCP pages at the time they are drained from the PCP list. This will trigger the bug but the site that freed the corrupt page will be lost. To get the full context, a kernel rebuild with DEBUG_VM is necessary. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39c441b..759d3f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -941,6 +941,103 @@ static inline int free_pages_check(struct page *page) return 1; } +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. 
+ */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +static bool free_pages_prepare(struct page *page, unsigned int order); + +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, 0); + kmemcheck_free_shadow(page, 0); + kasan_free_pages(page, 0); + + if (PageAnonHead(page)) + page->mapping = NULL; + + reset_page_owner(page, 0); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE); + } + arch_free_page(page, 0); + kernel_poison_pages(page, 0, 0); + kernel_map_pages(page, 0, 0); + + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return true; +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_pages_check(page); +} +#endif /* CONFIG_DEBUG_VM */ + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1002,6 +1099,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); + if (bulkfree_pcp_prepare(page)) + continue; + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); @@ -1028,56 +1128,6 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } -static int free_tail_pages_check(struct page *head_page, struct page *page) -{ - int ret = 1; - - /* - * We rely page->lru.next never has bit 0 set, unless the page - * is PageTail(). Let's make sure that's true even for poisoned ->lru. - */ - BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) { - ret = 0; - goto out; - } - switch (page - head_page) { - case 1: - /* the first tail page: ->mapping is compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); - goto out; - } - break; - case 2: - /* - * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. 
- */ - break; - default: - if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); - goto out; - } - break; - } - if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); - goto out; - } - if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); - goto out; - } - ret = 0; -out: - page->mapping = NULL; - clear_compound_head(page); - return ret; -} - static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { @@ -2339,7 +2389,7 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_pages_prepare(page, 0)) + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); -- cgit v0.10.2 From 479f854a207ce2b97545a0a83856778b541063d0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:35 -0700 Subject: mm, page_alloc: defer debugging checks of pages allocated from the PCP Every page allocated checks a number of page fields for validity. This catches corruption bugs of pages that are already freed but it is expensive. This patch weakens the debugging check by checking PCP pages only when the PCP lists are being refilled. All compound pages are checked. This potentially avoids debugging checks entirely if the PCP lists are never emptied and refilled so some corruption issues may be missed. Full checking requires DEBUG_VM. With the two deferred debugging patches applied, the impact to a page allocator microbenchmark is 4.6.0-rc3 4.6.0-rc3 inline-v3r6 deferalloc-v3r7 Min alloc-odr0-1 344.00 ( 0.00%) 317.00 ( 7.85%) Min alloc-odr0-2 248.00 ( 0.00%) 231.00 ( 6.85%) Min alloc-odr0-4 209.00 ( 0.00%) 192.00 ( 8.13%) Min alloc-odr0-8 181.00 ( 0.00%) 166.00 ( 8.29%) Min alloc-odr0-16 168.00 ( 0.00%) 154.00 ( 8.33%) Min alloc-odr0-32 161.00 ( 0.00%) 148.00 ( 8.07%) Min alloc-odr0-64 158.00 ( 0.00%) 145.00 ( 8.23%) Min alloc-odr0-128 156.00 ( 0.00%) 143.00 ( 8.33%) Min alloc-odr0-256 168.00 ( 0.00%) 154.00 ( 8.33%) Min alloc-odr0-512 178.00 ( 0.00%) 167.00 ( 6.18%) Min alloc-odr0-1024 186.00 ( 0.00%) 174.00 ( 6.45%) Min alloc-odr0-2048 192.00 ( 0.00%) 180.00 ( 6.25%) Min alloc-odr0-4096 198.00 ( 0.00%) 184.00 ( 7.07%) Min alloc-odr0-8192 200.00 ( 0.00%) 188.00 ( 6.00%) Min alloc-odr0-16384 201.00 ( 0.00%) 188.00 ( 6.47%) Min free-odr0-1 189.00 ( 0.00%) 180.00 ( 4.76%) Min free-odr0-2 132.00 ( 0.00%) 126.00 ( 4.55%) Min free-odr0-4 104.00 ( 0.00%) 99.00 ( 4.81%) Min free-odr0-8 90.00 ( 0.00%) 85.00 ( 5.56%) Min free-odr0-16 84.00 ( 0.00%) 80.00 ( 4.76%) Min free-odr0-32 80.00 ( 0.00%) 76.00 ( 5.00%) Min free-odr0-64 78.00 ( 0.00%) 74.00 ( 5.13%) Min free-odr0-128 77.00 ( 0.00%) 73.00 ( 5.19%) Min free-odr0-256 94.00 ( 0.00%) 91.00 ( 3.19%) Min free-odr0-512 108.00 ( 0.00%) 112.00 ( -3.70%) Min free-odr0-1024 115.00 ( 0.00%) 118.00 ( -2.61%) Min free-odr0-2048 120.00 ( 0.00%) 125.00 ( -4.17%) Min free-odr0-4096 123.00 ( 0.00%) 129.00 ( -4.88%) Min free-odr0-8192 126.00 ( 0.00%) 130.00 ( -3.17%) Min free-odr0-16384 126.00 ( 0.00%) 131.00 ( -3.97%) Note that the free paths for large numbers of pages is impacted as the debugging cost gets shifted into that path when the page data is no longer necessarily cache-hot. 
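The shape of this trade-off can be shown with a toy free-list cache in userspace C; the object type and the sanity check are invented and merely stand in for struct page and its field checks. Validation is paid once per bulk refill, so the per-allocation path stays check-free:

#include <stdbool.h>
#include <stdio.h>

#define CACHE_BATCH 4
#define NR_OBJS     16

struct obj {
        int magic;      /* stand-in for page state worth checking */
};

static bool check_obj(const struct obj *o)
{
        return o->magic == 0xC0FFEE;    /* a "sane" object */
}

static struct obj backing[NR_OBJS];
static int backing_next;

static struct obj *cache[CACHE_BATCH];
static int cache_count;

/* Refill is the slow path, so the sanity checks are paid here... */
static void cache_refill(void)
{
        while (cache_count < CACHE_BATCH && backing_next < NR_OBJS) {
                struct obj *o = &backing[backing_next++];

                if (!check_obj(o))      /* reject bad objects at refill */
                        continue;
                cache[cache_count++] = o;
        }
}

/* ...which keeps the allocation fast path free of per-object checks. */
static struct obj *cache_alloc(void)
{
        if (!cache_count)
                cache_refill();
        return cache_count ? cache[--cache_count] : NULL;
}

int main(void)
{
        int i, got = 0;

        for (i = 0; i < NR_OBJS; i++)
                backing[i].magic = (i == 5) ? 0 : 0xC0FFEE;     /* one bad object */

        while (cache_alloc())
                got++;
        printf("allocated %d of %d objects; 1 rejected at refill\n", got, NR_OBJS);
        return 0;
}

As in the patch, a corrupted object is still caught, but only at refill time, away from the hot allocation path.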
From e2769dbdc51f1baa1908ecf6c84d50f19577e1db Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 19 May 2016 17:14:38 -0700
Subject: mm, page_alloc: don't duplicate code in free_pcp_prepare

The new free_pcp_prepare() function shares a lot of code with
free_pages_prepare(), which makes this a maintenance risk when some
future patch modifies only one of them.  We should be able to achieve
the same effect (skipping free_pages_check() in !DEBUG_VM configs) by
adding a parameter to free_pages_prepare() and making it inline, so that
the checks (and the order != 0 parts) are eliminated from the call site
in free_pcp_prepare().

!DEBUG_VM: bloat-o-meter reports no difference, as my gcc was already
inlining free_pages_prepare() and the elimination seems to work as
expected.

DEBUG_VM bloat-o-meter:

add/remove: 0/1 grow/shrink: 2/0 up/down: 1035/-778 (257)
function                old     new   delta
__free_pages_ok         297    1060    +763
free_hot_cold_page      480     752    +272
free_pages_prepare      778       -    -778

Here inlining did not occur before; the change adds some code, but that
is acceptable for a debug option.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Vlastimil Babka
Signed-off-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 193ed34..7d8f642 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -991,47 +991,77 @@ out:
 	return ret;
 }
 
-static bool free_pages_prepare(struct page *page, unsigned int order);
-
-#ifdef CONFIG_DEBUG_VM
-static inline bool free_pcp_prepare(struct page *page)
+static __always_inline bool free_pages_prepare(struct page *page,
+					unsigned int order, bool check_free)
 {
-	return free_pages_prepare(page, 0);
-}
+	int bad = 0;
 
-static inline bool bulkfree_pcp_prepare(struct page *page)
-{
-	return false;
-}
-#else
-static bool free_pcp_prepare(struct page *page)
-{
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
-	trace_mm_page_free(page, 0);
-	kmemcheck_free_shadow(page, 0);
-	kasan_free_pages(page, 0);
+	trace_mm_page_free(page, order);
+	kmemcheck_free_shadow(page, order);
+	kasan_free_pages(page, order);
+
+	/*
+	 * Check tail pages before head page information is cleared to
+	 * avoid checking PageCompound for order-0 pages.
+	 */
+	if (unlikely(order)) {
+		bool compound = PageCompound(page);
+		int i;
+
+		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+		for (i = 1; i < (1 << order); i++) {
+			if (compound)
+				bad += free_tail_pages_check(page, page + i);
+			if (unlikely(free_pages_check(page + i))) {
+				bad++;
+				continue;
+			}
+			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		}
+	}
 
 	if (PageAnonHead(page))
 		page->mapping = NULL;
+	if (check_free)
+		bad += free_pages_check(page);
+	if (bad)
+		return false;
 
-	reset_page_owner(page, 0);
+	page_cpupid_reset_last(page);
+	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	reset_page_owner(page, order);
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),
-					   PAGE_SIZE);
+					   PAGE_SIZE << order);
 		debug_check_no_obj_freed(page_address(page),
-					   PAGE_SIZE);
+					   PAGE_SIZE << order);
 	}
-	arch_free_page(page, 0);
-	kernel_poison_pages(page, 0, 0);
-	kernel_map_pages(page, 0, 0);
+	arch_free_page(page, order);
+	kernel_poison_pages(page, 1 << order, 0);
+	kernel_map_pages(page, 1 << order, 0);
 
-	page_cpupid_reset_last(page);
-	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return true;
 }
 
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+	return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+	return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+	return free_pages_prepare(page, 0, false);
+}
+
 static bool bulkfree_pcp_prepare(struct page *page)
 {
 	return free_pages_check(page);
@@ -1201,66 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
 	}
 }
 
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
-	int bad = 0;
-
-	VM_BUG_ON_PAGE(PageTail(page), page);
-
-	trace_mm_page_free(page, order);
-	kmemcheck_free_shadow(page, order);
-	kasan_free_pages(page, order);
-
-	/*
-	 * Check tail pages before head page information is cleared to
-	 * avoid checking PageCompound for order-0 pages.
-	 */
-	if (unlikely(order)) {
-		bool compound = PageCompound(page);
-		int i;
-
-		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
-		for (i = 1; i < (1 << order); i++) {
-			if (compound)
-				bad += free_tail_pages_check(page, page + i);
-			if (unlikely(free_pages_check(page + i))) {
-				bad++;
-				continue;
-			}
-			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-		}
-	}
-	if (PageAnonHead(page))
-		page->mapping = NULL;
-	bad += free_pages_check(page);
-	if (bad)
-		return false;
-
-	page_cpupid_reset_last(page);
-	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-	reset_page_owner(page, order);
-
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page),
-					   PAGE_SIZE << order);
-		debug_check_no_obj_freed(page_address(page),
-					   PAGE_SIZE << order);
-	}
-	arch_free_page(page, order);
-	kernel_poison_pages(page, 1 << order, 0);
-	kernel_map_pages(page, 1 << order, 0);
-
-	return true;
-}
-
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
-	if (!free_pages_prepare(page, order))
+	if (!free_pages_prepare(page, order, true))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);

--
cgit v0.10.2
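
Editor's note: the trick in the patch above is a compile-time boolean
parameter on an __always_inline function. Every caller passes a constant
for check_free, so constant propagation after forced inlining deletes the
unwanted branch and yields two specialized functions from one shared body.
A small standalone sketch of the same technique, using GCC/Clang attribute
syntax and invented names:

/* Standalone sketch of compile-time specialization via forced inlining.
 * release_prepare() plays the role of free_pages_prepare(); the constant
 * `check` parameter plays the role of check_free. Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static bool object_check(int canary)	/* stand-in for free_pages_check() */
{
	return canary == 0;
}

/*
 * One shared body. Every caller passes a literal for `check`, so after
 * forced inlining the compiler folds the branch away and each call site
 * keeps only the code it actually needs.
 */
static inline __attribute__((always_inline)) bool
release_prepare(int canary, bool check)
{
	if (check && !object_check(canary))
		return false;			/* corrupt: refuse to release */

	printf("releasing %d\n", canary);	/* shared teardown work */
	return true;
}

/* checked variant, like __free_pages_ok() passing check_free == true */
static bool release_checked(int canary)
{
	return release_prepare(canary, true);
}

/* unchecked variant, like the !DEBUG_VM free_pcp_prepare() call site */
static bool release_unchecked(int canary)
{
	return release_prepare(canary, false);
}

int main(void)
{
	return release_checked(0) && release_unchecked(0) ? 0 : 1;
}

Inspecting the generated assembly of release_unchecked() should show no
trace of object_check(), which is the effect the patch relies on to keep
free_pcp_prepare() cheap without duplicating the shared body.
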
From 4e6118016eb7986109ad61b00186579f384f956a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Thu, 19 May 2016 17:14:41 -0700
Subject: mm, page_alloc: uninline the bad page part of check_new_page()

Bad pages should be rare, so the code that handles them does not need to
be inlined for performance reasons.  Move it to a separate function that
returns void.  This also assumes that the initial page_expected_state()
result will match the result of the thorough check, i.e. the page does
not become "good" in the meantime.  This matches the expectations already
in place in free_pages_check().

!DEBUG_VM bloat-o-meter:

add/remove: 1/0 grow/shrink: 0/1 up/down: 134/-274 (-140)
function                     old     new   delta
check_new_page_bad             -     134    +134
get_page_from_freelist      3468    3194    -274

Signed-off-by: Vlastimil Babka
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d8f642..ecf6633 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1647,19 +1647,11 @@ static inline void expand(struct zone *zone, struct page *page,
 	}
 }
 
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
 {
-	const char *bad_reason;
-	unsigned long bad_flags;
+	const char *bad_reason = NULL;
+	unsigned long bad_flags = 0;
 
-	if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))
-		return 0;
-
-	bad_reason = NULL;
-	bad_flags = 0;
 	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
@@ -1678,11 +1670,20 @@ static inline int check_new_page(struct page *page)
 	if (unlikely(page->mem_cgroup))
 		bad_reason = "page still charged to cgroup";
 #endif
-	if (unlikely(bad_reason)) {
-		bad_page(page, bad_reason, bad_flags);
-		return 1;
-	}
-	return 0;
+	bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+	if (likely(page_expected_state(page,
+				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+		return 0;
+
+	check_new_page_bad(page);
+	return 1;
 }
 
 static inline bool free_pages_prezeroed(bool poisoned)

--
cgit v0.10.2
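
Editor's note: the pattern above keeps only the predicted-true test inline
and pushes the rare diagnostic path out of line, so its code is not
duplicated into every call site. A minimal sketch under the same
assumptions (bad objects are rare; the handler only reports and rejects,
it does not recover), with invented names:

/* Sketch of the hot/cold split; GCC/Clang attribute syntax assumed. */
#include <stdio.h>

/* cold path: runs only for corrupt objects, so its size must not be
 * duplicated into every inlined caller of check_object() */
static __attribute__((noinline, cold)) int check_object_bad(int state)
{
	fprintf(stderr, "bad object state: %d\n", state);
	return 1;		/* reject, the way bad_page() flags a page */
}

/* hot path: a single predicted-true comparison, cheap to inline */
static inline int check_object(int state)
{
	if (__builtin_expect(state == 0, 1))
		return 0;

	return check_object_bad(state);
}

int main(void)
{
	(void)check_object(42);		/* rare cold path */
	return check_object(0);		/* common hot path */
}
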
From 4741526b83c5d3a3d661d1896f9e7414c5730bcb Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Thu, 19 May 2016 17:14:44 -0700
Subject: mm, page_alloc: restore the original nodemask if the fast path allocation failed

The page allocator fast path uses either the requested nodemask or
cpuset_current_mems_allowed if cpusets are enabled.  If the allocation
context allows watermarks to be ignored then it can also ignore memory
policies.  However, on entering the allocator slowpath the nodemask may
still be cpuset_current_mems_allowed and the policies are enforced.  This
patch resets the nodemask appropriately before entering the slowpath.

Link: http://lkml.kernel.org/r/20160504143628.GU2858@techsingularity.net
Signed-off-by: Vlastimil Babka
Signed-off-by: Mel Gorman
Cc: Jesper Dangaard Brouer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ecf6633..5c469c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3636,6 +3636,12 @@ retry_cpuset:
 	alloc_mask = memalloc_noio_flags(gfp_mask);
 	ac.spread_dirty_pages = false;
 
+	/*
+	 * Restore the original nodemask if it was potentially replaced with
+	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+	 */
+	if (cpusets_enabled())
+		ac.nodemask = nodemask;
 	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 no_zone:

--
cgit v0.10.2
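
Editor's note: the shape of the fix above is general. A fast path that
substitutes a narrower default for a missing constraint must hand the
caller's original constraint back to the slow path, or the slow path's
fallback logic inherits the substitution. A hedged userspace sketch, in
which nodemask_t and every function name are invented stand-ins:

/* Userspace model only; not the kernel's types or allocator. */
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

typedef unsigned long nodemask_t;

struct alloc_context {
	nodemask_t *nodemask;
};

static nodemask_t current_mems_allowed = 0x1;	/* cpuset-style default */

static void *try_fastpath(struct alloc_context *ac)
{
	(void)ac;
	return NULL;			/* pretend the fast path failed */
}

static void *try_slowpath(struct alloc_context *ac)
{
	(void)ac;	/* real fallback logic would consult ac->nodemask */
	return malloc(1);
}

static void *do_alloc(nodemask_t *nodemask, bool cpusets_enabled)
{
	struct alloc_context ac = {
		/* fast path: substitute the cpuset mask if none was given */
		.nodemask = nodemask ? nodemask : &current_mems_allowed,
	};
	void *obj = try_fastpath(&ac);

	if (obj)
		return obj;

	/*
	 * Restore the caller's original mask (possibly NULL) before the
	 * slow path, as the patch does, so slow-path decisions are not
	 * silently constrained by the fast-path substitution.
	 */
	if (cpusets_enabled)
		ac.nodemask = nodemask;

	return try_slowpath(&ac);
}

int main(void)
{
	free(do_alloc(NULL, true));
	return 0;
}
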