author    Scott Wood <scottwood@freescale.com>  2014-04-07 23:49:35 (GMT)
committer Scott Wood <scottwood@freescale.com>  2014-04-07 23:49:35 (GMT)
commit    62b8c978ee6b8d135d9e7953221de58000dba986 (patch)
tree      683b04b2e627f6710c22c151b23c8cc9a165315e /kernel
parent    78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download  linux-fsl-qoriq-62b8c978ee6b8d135d9e7953221de58000dba986.tar.xz
Rewind v3.13-rc3+ (78fd82238d0e5716) to v3.12
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 81
-rw-r--r--  kernel/audit.c | 153
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/auditfilter.c | 3
-rw-r--r--  kernel/auditsc.c | 133
-rw-r--r--  kernel/bounds.c | 6
-rw-r--r--  kernel/cgroup.c | 290
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 49
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/cpuset.c | 8
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/elfcore.c | 10
-rw-r--r--  kernel/events/core.c | 180
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 101
-rw-r--r--  kernel/events/uprobes.c | 223
-rw-r--r--  kernel/extable.c | 4
-rw-r--r--  kernel/fork.c | 13
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/gcov/Kconfig | 30
-rw-r--r--  kernel/gcov/Makefile | 32
-rw-r--r--  kernel/gcov/base.c | 32
-rw-r--r--  kernel/gcov/fs.c | 52
-rw-r--r--  kernel/gcov/gcc_3_4.c | 115
-rw-r--r--  kernel/gcov/gcc_4_7.c | 560
-rw-r--r--  kernel/gcov/gcov.h | 65
-rw-r--r--  kernel/hung_task.c | 17
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/jump_label.c | 5
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/kthread.c | 73
-rw-r--r--  kernel/lglock.c (renamed from kernel/locking/lglock.c) | 0
-rw-r--r--  kernel/lockdep.c (renamed from kernel/locking/lockdep.c) | 8
-rw-r--r--  kernel/lockdep_internals.h (renamed from kernel/locking/lockdep_internals.h) | 0
-rw-r--r--  kernel/lockdep_proc.c (renamed from kernel/locking/lockdep_proc.c) | 15
-rw-r--r--  kernel/lockdep_states.h (renamed from kernel/locking/lockdep_states.h) | 0
-rw-r--r--  kernel/locking/Makefile | 25
-rw-r--r--  kernel/locking/percpu-rwsem.c | 165
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 296
-rw-r--r--  kernel/locking/rwsem-xadd.c | 293
-rw-r--r--  kernel/locking/spinlock_debug.c | 302
-rw-r--r--  kernel/modsign_certificate.S | 12
-rw-r--r--  kernel/modsign_pubkey.c | 104
-rw-r--r--  kernel/module-internal.h | 2
-rw-r--r--  kernel/module.c | 169
-rw-r--r--  kernel/module_signing.c | 11
-rw-r--r--  kernel/mutex-debug.c (renamed from kernel/locking/mutex-debug.c) | 0
-rw-r--r--  kernel/mutex-debug.h (renamed from kernel/locking/mutex-debug.h) | 0
-rw-r--r--  kernel/mutex.c (renamed from kernel/locking/mutex.c) | 2
-rw-r--r--  kernel/mutex.h (renamed from kernel/locking/mutex.h) | 0
-rw-r--r--  kernel/padata.c | 9
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/power/Kconfig | 16
-rw-r--r--  kernel/power/qos.c | 26
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/user.c | 21
-rw-r--r--  kernel/printk/printk.c | 35
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/rcu.h (renamed from kernel/rcu/rcu.h) | 7
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcupdate.c (renamed from kernel/rcu/update.c) | 10
-rw-r--r--  kernel/rcutiny.c (renamed from kernel/rcu/tiny.c) | 37
-rw-r--r--  kernel/rcutiny_plugin.h (renamed from kernel/rcu/tiny_plugin.h) | 0
-rw-r--r--  kernel/rcutorture.c (renamed from kernel/rcu/torture.c) | 6
-rw-r--r--  kernel/rcutree.c (renamed from kernel/rcu/tree.c) | 200
-rw-r--r--  kernel/rcutree.h (renamed from kernel/rcu/tree.h) | 2
-rw-r--r--  kernel/rcutree_plugin.h (renamed from kernel/rcu/tree_plugin.h) | 88
-rw-r--r--  kernel/rcutree_trace.c (renamed from kernel/rcu/tree_trace.c) | 2
-rw-r--r--  kernel/rtmutex-debug.c (renamed from kernel/locking/rtmutex-debug.c) | 0
-rw-r--r--  kernel/rtmutex-debug.h (renamed from kernel/locking/rtmutex-debug.h) | 0
-rw-r--r--  kernel/rtmutex-tester.c (renamed from kernel/locking/rtmutex-tester.c) | 0
-rw-r--r--  kernel/rtmutex.c (renamed from kernel/locking/rtmutex.c) | 0
-rw-r--r--  kernel/rtmutex.h (renamed from kernel/locking/rtmutex.h) | 0
-rw-r--r--  kernel/rtmutex_common.h (renamed from kernel/locking/rtmutex_common.h) | 0
-rw-r--r--  kernel/rwsem.c (renamed from kernel/locking/rwsem.c) | 0
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 703
-rw-r--r--  kernel/sched/debug.c | 68
-rw-r--r--  kernel/sched/fair.c | 1445
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 54
-rw-r--r--  kernel/sched/stats.h | 46
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/semaphore.c (renamed from kernel/locking/semaphore.c) | 0
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/smp.c | 19
-rw-r--r--  kernel/softirq.c | 184
-rw-r--r--  kernel/spinlock.c (renamed from kernel/locking/spinlock.c) | 0
-rw-r--r--  kernel/srcu.c (renamed from kernel/rcu/srcu.c) | 0
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 1
-rw-r--r--  kernel/sysctl.c | 34
-rw-r--r--  kernel/sysctl_binary.c | 6
-rw-r--r--  kernel/system_certificates.S | 10
-rw-r--r--  kernel/system_keyring.c | 105
-rw-r--r--  kernel/taskstats.c | 54
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 3
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 15
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 25
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 13
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  kernel/trace/ftrace.c | 225
-rw-r--r--  kernel/trace/trace.c | 85
-rw-r--r--  kernel/trace/trace.h | 51
-rw-r--r--  kernel/trace/trace_branch.c | 2
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 35
-rw-r--r--  kernel/trace/trace_events_filter.c | 218
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 82
-rw-r--r--  kernel/trace/trace_kprobe.c | 4
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 4
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 41
-rw-r--r--  kernel/trace/trace_syscalls.c | 32
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/up.c | 11
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/wait.c (renamed from kernel/sched/wait.c) | 127
-rw-r--r--  kernel/workqueue.c | 50
147 files changed, 2542 insertions, 6416 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a8..94fabd5 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS
+ def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/Makefile b/kernel/Makefile
index bbaf7d5..1ce4755 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,44 +6,56 @@ obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
- extable.o params.o posix-timers.o \
- kthread.o sys_ni.o posix-cpu-timers.o \
- hrtimer.o nsproxy.o \
+ rcupdate.o extable.o params.o posix-timers.o \
+ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
+ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o groups.o smpboot.o
+ async.o range.o groups.o lglock.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
obj-y += sched/
-obj-y += locking/
obj-y += power/
obj-y += printk/
obj-y += cpu/
obj-y += irq/
-obj-y += rcu/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -69,6 +81,12 @@ obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -123,52 +141,19 @@ targets += timeconst.h
$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
$(call if_changed,bc)
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
+ifeq ($(CONFIG_MODULE_SIG),y)
#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
+# Pull the signing certificate and any extra certificates into the kernel
#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509
-X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(info X.509 certificate list changed)
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+quiet_cmd_touch = TOUCH $@
+ cmd_touch = touch $@
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
+extra_certificates:
+ $(call cmd,touch)
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
+kernel/modsign_certificate.o: signing_key.x509 extra_certificates
-clean-files := x509_certificate_list .x509.list
-endif
-
-ifeq ($(CONFIG_MODULE_SIG),y)
###############################################################################
#
# If module signing is requested, say by allyesconfig, but a key has not been
diff --git a/kernel/audit.c b/kernel/audit.c
index 906ae5a0..7b0e23a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,6 +60,7 @@
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
+#include <net/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
#include <linux/pid_namespace.h>
@@ -139,17 +140,6 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
-static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
- .mask = -1,
- .features = 0,
- .lock = 0,};
-
-static char *audit_feature_names[2] = {
- "only_unset_loginuid",
- "loginuid_immutable",
-};
-
-
/* Serialize requests from userspace. */
DEFINE_MUTEX(audit_cmd_mutex);
@@ -594,8 +584,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return -EOPNOTSUPP;
case AUDIT_GET:
case AUDIT_SET:
- case AUDIT_GET_FEATURE:
- case AUDIT_SET_FEATURE:
case AUDIT_LIST_RULES:
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
@@ -625,7 +613,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
- if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
+ if (!audit_enabled) {
*ab = NULL;
return rc;
}
@@ -640,94 +628,6 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
return rc;
}
-int is_audit_feature_set(int i)
-{
- return af.features & AUDIT_FEATURE_TO_MASK(i);
-}
-
-
-static int audit_get_feature(struct sk_buff *skb)
-{
- u32 seq;
-
- seq = nlmsg_hdr(skb)->nlmsg_seq;
-
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &af, sizeof(af));
-
- return 0;
-}
-
-static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
- u32 old_lock, u32 new_lock, int res)
-{
- struct audit_buffer *ab;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
- audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d",
- audit_feature_names[which], !!old_feature, !!new_feature,
- !!old_lock, !!new_lock, res);
- audit_log_end(ab);
-}
-
-static int audit_set_feature(struct sk_buff *skb)
-{
- struct audit_features *uaf;
- int i;
-
- BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
- uaf = nlmsg_data(nlmsg_hdr(skb));
-
- /* if there is ever a version 2 we should handle that here */
-
- for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
- u32 feature = AUDIT_FEATURE_TO_MASK(i);
- u32 old_feature, new_feature, old_lock, new_lock;
-
- /* if we are not changing this feature, move along */
- if (!(feature & uaf->mask))
- continue;
-
- old_feature = af.features & feature;
- new_feature = uaf->features & feature;
- new_lock = (uaf->lock | af.lock) & feature;
- old_lock = af.lock & feature;
-
- /* are we changing a locked feature? */
- if ((af.lock & feature) && (new_feature != old_feature)) {
- audit_log_feature_change(i, old_feature, new_feature,
- old_lock, new_lock, 0);
- return -EPERM;
- }
- }
- /* nothing invalid, do the changes */
- for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
- u32 feature = AUDIT_FEATURE_TO_MASK(i);
- u32 old_feature, new_feature, old_lock, new_lock;
-
- /* if we are not changing this feature, move along */
- if (!(feature & uaf->mask))
- continue;
-
- old_feature = af.features & feature;
- new_feature = uaf->features & feature;
- old_lock = af.lock & feature;
- new_lock = (uaf->lock | af.lock) & feature;
-
- if (new_feature != old_feature)
- audit_log_feature_change(i, old_feature, new_feature,
- old_lock, new_lock, 1);
-
- if (new_feature)
- af.features |= feature;
- else
- af.features &= ~feature;
- af.lock |= new_lock;
- }
-
- return 0;
-}
-
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
@@ -759,7 +659,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
switch (msg_type) {
case AUDIT_GET:
- memset(&status_set, 0, sizeof(status_set));
status_set.enabled = audit_enabled;
status_set.failure = audit_failure;
status_set.pid = audit_pid;
@@ -771,7 +670,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
&status_set, sizeof(status_set));
break;
case AUDIT_SET:
- if (nlmsg_len(nlh) < sizeof(struct audit_status))
+ if (nlh->nlmsg_len < sizeof(struct audit_status))
return -EINVAL;
status_get = (struct audit_status *)data;
if (status_get->mask & AUDIT_STATUS_ENABLED) {
@@ -800,16 +699,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
err = audit_set_backlog_limit(status_get->backlog_limit);
break;
- case AUDIT_GET_FEATURE:
- err = audit_get_feature(skb);
- if (err)
- return err;
- break;
- case AUDIT_SET_FEATURE:
- err = audit_set_feature(skb);
- if (err)
- return err;
- break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
@@ -826,8 +715,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
- audit_log_format(ab, " msg='%.*s'",
- AUDIT_MESSAGE_TEXT_MAX,
+ audit_log_format(ab, " msg='%.1024s'",
(char *)data);
else {
int size;
@@ -930,7 +818,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct task_struct *tsk = current;
spin_lock(&tsk->sighand->siglock);
- s.enabled = tsk->signal->audit_tty;
+ s.enabled = tsk->signal->audit_tty != 0;
s.log_passwd = tsk->signal->audit_tty_log_passwd;
spin_unlock(&tsk->sighand->siglock);
@@ -944,7 +832,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len));
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
return -EINVAL;
@@ -1179,6 +1067,13 @@ static void wait_for_auditd(unsigned long sleep_time)
remove_wait_queue(&audit_backlog_wait, &wait);
}
+/* Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, tsk
+ * should be NULL. */
+
/**
* audit_log_start - obtain an audit buffer
* @ctx: audit_context (may be NULL)
@@ -1494,7 +1389,7 @@ void audit_log_session_info(struct audit_buffer *ab)
u32 sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+ audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
@@ -1641,26 +1536,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
}
}
- /* log the audit_names record type */
- audit_log_format(ab, " nametype=");
- switch(n->type) {
- case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, "NORMAL");
- break;
- case AUDIT_TYPE_PARENT:
- audit_log_format(ab, "PARENT");
- break;
- case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, "DELETE");
- break;
- case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, "CREATE");
- break;
- default:
- audit_log_format(ab, "UNKNOWN");
- break;
- }
-
audit_log_fcaps(ab, n);
audit_log_end(ab);
}
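
[Note: the audit_set_feature() logic removed above makes two passes over the same bitmask: the first pass only validates, rejecting any attempt to flip a locked feature, and the second applies the changes and accumulates new locks, so a request either fails or succeeds as a whole. A minimal userspace sketch of that validate-then-apply pattern follows; struct features, set_features and LAST_FEATURE are hypothetical stand-ins, not kernel API.]

#include <errno.h>
#include <stdint.h>

/* Hypothetical stand-in for the kernel's struct audit_features. */
struct features {
	uint32_t features;	/* current on/off bits */
	uint32_t lock;		/* bits that may never change again */
};

#define FEATURE_TO_MASK(i)	(1U << (i))
#define LAST_FEATURE		1

static int set_features(struct features *cur, uint32_t mask,
			uint32_t new_features, uint32_t new_locks)
{
	int i;

	/* Pass 1: reject the whole request if a locked bit would flip. */
	for (i = 0; i <= LAST_FEATURE; i++) {
		uint32_t f = FEATURE_TO_MASK(i);

		if (!(f & mask))
			continue;
		if ((cur->lock & f) &&
		    (new_features & f) != (cur->features & f))
			return -EPERM;
	}
	/* Pass 2: nothing invalid, apply changes and accumulate locks. */
	for (i = 0; i <= LAST_FEATURE; i++) {
		uint32_t f = FEATURE_TO_MASK(i);

		if (!(f & mask))
			continue;
		if (new_features & f)
			cur->features |= f;
		else
			cur->features &= ~f;
		cur->lock |= new_locks & f;
	}
	return 0;
}
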
diff --git a/kernel/audit.h b/kernel/audit.h
index b779642..123c9b7 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -197,9 +197,6 @@ struct audit_context {
int fd;
int flags;
} mmap;
- struct {
- int argc;
- } execve;
};
int fds[2];
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 51f3fd4..f7aee8b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -343,7 +343,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_DEVMINOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
- case AUDIT_INODE:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
@@ -424,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->lsm_rule = NULL;
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
f->type = AUDIT_LOGINUID_SET;
f->val = 0;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 90594c9..9845cb3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -95,6 +95,13 @@ struct audit_aux_data {
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
+struct audit_aux_data_execve {
+ struct audit_aux_data d;
+ int argc;
+ int envc;
+ struct mm_struct *mm;
+};
+
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
@@ -114,6 +121,12 @@ struct audit_aux_data_bprm_fcaps {
struct audit_cap_data new_pcap;
};
+struct audit_aux_data_capset {
+ struct audit_aux_data d;
+ pid_t pid;
+ struct audit_cap_data cap;
+};
+
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
@@ -553,7 +566,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_INODE:
if (name)
- result = audit_comparator(name->ino, f->op, f->val);
+ result = (name->ino == f->val);
else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(n->ino, f->op, f->val)) {
@@ -930,10 +943,8 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED) {
- clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ if (state == AUDIT_DISABLED)
return 0;
- }
if (!(context = audit_alloc_context(state))) {
kfree(key);
@@ -1138,16 +1149,20 @@ static int audit_log_single_execve_arg(struct audit_context *context,
}
static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
+ struct audit_buffer **ab,
+ struct audit_aux_data_execve *axi)
{
int i, len;
size_t len_sent = 0;
const char __user *p;
char *buf;
- p = (const char __user *)current->mm->arg_start;
+ if (axi->mm != current->mm)
+ return; /* execve failed, no additional info */
+
+ p = (const char __user *)axi->mm->arg_start;
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ audit_log_format(*ab, "argc=%d", axi->argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1161,7 +1176,7 @@ static void audit_log_execve_info(struct audit_context *context,
return;
}
- for (i = 0; i < context->execve.argc; i++) {
+ for (i = 0; i < axi->argc; i++) {
len = audit_log_single_execve_arg(context, ab, i,
&len_sent, p, buf);
if (len <= 0)
@@ -1264,9 +1279,6 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break; }
- case AUDIT_EXECVE: {
- audit_log_execve_info(context, &ab);
- break; }
}
audit_log_end(ab);
}
@@ -1313,6 +1325,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
switch (aux->type) {
+ case AUDIT_EXECVE: {
+ struct audit_aux_data_execve *axi = (void *)aux;
+ audit_log_execve_info(context, &ab, axi);
+ break; }
+
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1947,43 +1964,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
- /* if we are unset, we don't need privs */
- if (!audit_loginuid_set(current))
- return 0;
- /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
- if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
- return -EPERM;
- /* it is set, you need permission */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
- /* reject if this is not an unset and we don't allow that */
- if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
- return -EPERM;
- return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
- unsigned int oldsessionid, unsigned int sessionid,
- int rc)
-{
- struct audit_buffer *ab;
- uid_t uid, ologinuid, nloginuid;
-
- uid = from_kuid(&init_user_ns, task_uid(current));
- ologinuid = from_kuid(&init_user_ns, koldloginuid);
- nloginuid = from_kuid(&init_user_ns, kloginuid),
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
- audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old "
- "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid,
- nloginuid, oldsessionid, sessionid, !rc);
- audit_log_end(ab);
-}
-
/**
* audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
@@ -1995,26 +1975,37 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
- unsigned int oldsessionid, sessionid = (unsigned int)-1;
- kuid_t oldloginuid;
- int rc;
-
- oldloginuid = audit_get_loginuid(current);
- oldsessionid = audit_get_sessionid(current);
+ struct audit_context *context = task->audit_context;
+ unsigned int sessionid;
- rc = audit_set_loginuid_perm(loginuid);
- if (rc)
- goto out;
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+ if (audit_loginuid_set(task))
+ return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
- /* are we setting or clearing? */
- if (uid_valid(loginuid))
- sessionid = atomic_inc_return(&session_id);
+ sessionid = atomic_inc_return(&session_id);
+ if (context && context->in_syscall) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (ab) {
+ audit_log_format(ab, "login pid=%d uid=%u "
+ "old auid=%u new auid=%u"
+ " old ses=%u new ses=%u",
+ task->pid,
+ from_kuid(&init_user_ns, task_uid(task)),
+ from_kuid(&init_user_ns, task->loginuid),
+ from_kuid(&init_user_ns, loginuid),
+ task->sessionid, sessionid);
+ audit_log_end(ab);
+ }
+ }
task->sessionid = sessionid;
task->loginuid = loginuid;
-out:
- audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
- return rc;
+ return 0;
}
/**
@@ -2135,12 +2126,22 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
context->ipc.has_perm = 1;
}
-void __audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
{
+ struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- context->type = AUDIT_EXECVE;
- context->execve.argc = bprm->argc;
+ ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->argc = bprm->argc;
+ ax->envc = bprm->envc;
+ ax->mm = bprm->mm;
+ ax->d.type = AUDIT_EXECVE;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 5253204..0c9b862 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,8 +10,6 @@
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/page_cgroup.h>
-#include <linux/log2.h>
-#include <linux/spinlock_types.h>
void foo(void)
{
@@ -19,9 +17,5 @@ void foo(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
-#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
-#endif
- DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));
/* End of constants */
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b729c2..8bd9cfd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,14 +90,6 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
/*
- * cgroup destruction makes heavy use of work items and there can be a lot
- * of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
- * which may lead to deadlock.
- */
-static struct workqueue_struct *cgroup_destroy_wq;
-
-/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -133,6 +125,38 @@ struct cfent {
};
/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX (65535)
+struct css_id {
+ /*
+ * The css to which this ID points. This pointer is set to valid value
+ * after cgroup is populated. If cgroup is removed, this will be NULL.
+ * This pointer is expected to be RCU-safe because destroy()
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
+ */
+ struct cgroup_subsys_state __rcu *css;
+ /*
+ * ID of this css.
+ */
+ unsigned short id;
+ /*
+ * Depth in hierarchy which this ID belongs to.
+ */
+ unsigned short depth;
+ /*
+ * ID is freed by RCU. (and lookup routine is RCU safe.)
+ */
+ struct rcu_head rcu_head;
+ /*
+ * Hierarchy of CSS ID belongs to.
+ */
+ unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+/*
* cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
@@ -199,7 +223,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -364,6 +387,9 @@ struct cgrp_cset_link {
static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *css);
+
/*
* css_set_lock protects the list of css_set objects, and the chain of
* tasks off each css_set. Nests outside task->alloc_lock due to
@@ -815,6 +841,8 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
+static int alloc_css_id(struct cgroup_subsys_state *child_css);
+
static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -880,7 +908,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
- queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+ schedule_work(&cgrp->destroy_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -904,6 +932,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
iput(inode);
}
+static int cgroup_delete(const struct dentry *d)
+{
+ return 1;
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -1490,7 +1523,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
{
static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
- .d_delete = always_delete_dentry,
+ .d_delete = cgroup_delete,
};
struct inode *inode =
@@ -2430,7 +2463,7 @@ static const struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
.write = cgroup_file_write,
.llseek = seq_lseek,
- .release = cgroup_file_release,
+ .release = single_release,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2491,8 +2524,6 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
ret = cft->release(inode, file);
if (css->ss)
css_put(css);
- if (file->f_op == &cgroup_seqfile_operations)
- single_release(inode, file);
return ret;
}
@@ -4209,6 +4240,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
goto err;
}
}
+
+ /* This cgroup is ready now */
+ for_each_root_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+ struct css_id *id = rcu_dereference_protected(css->id, true);
+
+ /*
+ * Update id->css pointer and make this css visible from
+ * CSS ID functions. This pointer will be dereferened
+ * from RCU-read-side without locks.
+ */
+ if (id)
+ rcu_assign_pointer(id->css, css);
+ }
+
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
@@ -4260,7 +4306,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
* css_put(). dput() requires process context which we don't have.
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ schedule_work(&css->destroy_work);
}
static void css_release(struct percpu_ref *ref)
@@ -4277,6 +4323,7 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
css->cgroup = cgrp;
css->ss = ss;
css->flags = 0;
+ css->id = NULL;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss);
@@ -4408,6 +4455,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_free_all;
init_css(css, ss, cgrp);
+
+ if (ss->use_id) {
+ err = alloc_css_id(css);
+ if (err)
+ goto err_free_all;
+ }
}
/*
@@ -4550,7 +4603,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ schedule_work(&css->destroy_work);
}
/**
@@ -4872,6 +4925,12 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
+ /* init_idr must be after init_css() because it sets css->id. */
+ if (ss->use_id) {
+ ret = cgroup_init_idr(ss, css);
+ if (ret)
+ goto err_unload;
+ }
/*
* Now we need to entangle the css into the existing css_sets. unlike
@@ -4937,6 +4996,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
offline_css(cgroup_css(cgroup_dummy_top, ss));
+ if (ss->use_id)
+ idr_destroy(&ss->idr);
+
/* deassign the subsys_id */
cgroup_subsys[ss->subsys_id] = NULL;
@@ -4963,7 +5025,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
- * the cgrp->subsys pointer to find their state.
+ * the cgrp->subsys pointer to find their state. note that this
+ * also takes care of freeing the css_id.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5034,6 +5097,8 @@ int __init cgroup_init(void)
for_each_builtin_subsys(ss, i) {
if (!ss->early_init)
cgroup_init_subsys(ss);
+ if (ss->use_id)
+ cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
}
/* allocate id for the dummy hierarchy */
@@ -5074,22 +5139,6 @@ out:
return err;
}
-static int __init cgroup_wq_init(void)
-{
- /*
- * There isn't much point in executing destruction path in
- * parallel. Good chunk is serialized with cgroup_mutex anyway.
- * Use 1 for @max_active.
- *
- * We would prefer to do this in cgroup_init() above, but that
- * is called before init_workqueues(): so leave this until after.
- */
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
- return 0;
-}
-core_initcall(cgroup_wq_init);
-
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -5469,6 +5518,181 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+/*
+ * Functons for CSS ID.
+ */
+
+/* to get ID other than 0, this should be called when !cgroup_is_dead() */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+ struct css_id *cssid;
+
+ /*
+ * This css_id() can return correct value when somone has refcnt
+ * on this or this is under rcu_read_lock(). Once css->id is allocated,
+ * it's unchanged until freed.
+ */
+ cssid = rcu_dereference_raw(css->id);
+
+ if (cssid)
+ return cssid->id;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(css_id);
+
+/**
+ * css_is_ancestor - test "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supporsed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, the caller must hold rcu_read_lock().
+ * But, considering usual usage, the csses should be valid objects after test.
+ * Assuming that the caller will do some action to the child if this returns
+ * returns true, the caller must take "child";s reference count.
+ * If "child" is valid object and this returns true, "root" is valid, too.
+ */
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+ const struct cgroup_subsys_state *root)
+{
+ struct css_id *child_id;
+ struct css_id *root_id;
+
+ child_id = rcu_dereference(child->id);
+ if (!child_id)
+ return false;
+ root_id = rcu_dereference(root->id);
+ if (!root_id)
+ return false;
+ if (child_id->depth < root_id->depth)
+ return false;
+ if (child_id->stack[root_id->depth] != root_id->id)
+ return false;
+ return true;
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+ struct css_id *id = rcu_dereference_protected(css->id, true);
+
+ /* When this is called before css_id initialization, id can be NULL */
+ if (!id)
+ return;
+
+ BUG_ON(!ss->use_id);
+
+ rcu_assign_pointer(id->css, NULL);
+ rcu_assign_pointer(css->id, NULL);
+ spin_lock(&ss->id_lock);
+ idr_remove(&ss->idr, id->id);
+ spin_unlock(&ss->id_lock);
+ kfree_rcu(id, rcu_head);
+}
+EXPORT_SYMBOL_GPL(free_css_id);
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (By cgroup_mutex() at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+ struct css_id *newid;
+ int ret, size;
+
+ BUG_ON(!ss->use_id);
+
+ size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+ newid = kzalloc(size, GFP_KERNEL);
+ if (!newid)
+ return ERR_PTR(-ENOMEM);
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&ss->id_lock);
+ /* Don't use 0. allocates an ID of 1-65535 */
+ ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
+ spin_unlock(&ss->id_lock);
+ idr_preload_end();
+
+ /* Returns error when there are no free spaces for new ID.*/
+ if (ret < 0)
+ goto err_out;
+
+ newid->id = ret;
+ newid->depth = depth;
+ return newid;
+err_out:
+ kfree(newid);
+ return ERR_PTR(ret);
+
+}
+
+static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *rootcss)
+{
+ struct css_id *newid;
+
+ spin_lock_init(&ss->id_lock);
+ idr_init(&ss->idr);
+
+ newid = get_new_cssid(ss, 0);
+ if (IS_ERR(newid))
+ return PTR_ERR(newid);
+
+ newid->stack[0] = newid->id;
+ RCU_INIT_POINTER(newid->css, rootcss);
+ RCU_INIT_POINTER(rootcss->id, newid);
+ return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys_state *child_css)
+{
+ struct cgroup_subsys_state *parent_css = css_parent(child_css);
+ struct css_id *child_id, *parent_id;
+ int i, depth;
+
+ parent_id = rcu_dereference_protected(parent_css->id, true);
+ depth = parent_id->depth + 1;
+
+ child_id = get_new_cssid(child_css->ss, depth);
+ if (IS_ERR(child_id))
+ return PTR_ERR(child_id);
+
+ for (i = 0; i < depth; i++)
+ child_id->stack[i] = parent_id->stack[i];
+ child_id->stack[depth] = child_id->id;
+ /*
+ * child_id->css pointer will be set after this cgroup is available
+ * see cgroup_populate_dir()
+ */
+ rcu_assign_pointer(child_css->id, child_id);
+
+ return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns pointer to cgroup_subsys_state if there is valid one with id.
+ * NULL if not. Should be called under rcu_read_lock()
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+ struct css_id *cssid = NULL;
+
+ BUG_ON(!ss->use_id);
+ cssid = idr_find(&ss->idr, id);
+
+ if (unlikely(!cssid))
+ return NULL;
+
+ return rcu_dereference(cssid->css);
+}
+EXPORT_SYMBOL_GPL(css_lookup);
+
/**
* css_from_dir - get corresponding css from the dentry of a cgroup dir
* @dentry: directory dentry of interest
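
[Note: the css_is_ancestor() test restored above works because each css_id records the IDs of all its ancestors (and itself) in stack[0..depth], so ancestry is a single array lookup rather than a walk up the hierarchy. A minimal sketch of that idea, using a hypothetical fixed-depth struct node_id in place of the kernel's variable-length css_id.]

#include <stdbool.h>

struct node_id {
	unsigned short id;
	unsigned short depth;
	unsigned short stack[8];	/* stack[d] = ID of the ancestor at depth d */
};

static bool is_ancestor(const struct node_id *child,
			const struct node_id *root)
{
	/* A node shallower than root cannot be its descendant. */
	if (child->depth < root->depth)
		return false;
	/* Descendants carry root's ID at root's depth in their stack. */
	return child->stack[root->depth] == root->id;
}
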
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e5f3917..859c8df 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage void __sched notrace preempt_schedule_context(void)
+void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index deff2e6..d7f07a2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -306,28 +306,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__func__, cpu);
goto out_release;
}
-
- /*
- * By now we've cleared cpu_active_mask, wait for all preempt-disabled
- * and RCU users of this state to go away such that all new such users
- * will observe it.
- *
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
- *
- * Do sync before park smpboot threads to take care the rcu boost case.
- */
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
-
smpboot_park_threads(cpu);
- /*
- * So now all preempt/rcu users must observe !cpu_active().
- */
-
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
@@ -440,6 +420,11 @@ int cpu_up(unsigned int cpu)
{
int err = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ int nid;
+ pg_data_t *pgdat;
+#endif
+
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
@@ -450,9 +435,27 @@ int cpu_up(unsigned int cpu)
return -EINVAL;
}
- err = try_online_node(cpu_to_node(cpu));
- if (err)
- return err;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ nid = cpu_to_node(cpu);
+ if (!node_online(nid)) {
+ err = mem_online_node(nid);
+ if (err)
+ return err;
+ }
+
+ pgdat = NODE_DATA(nid);
+ if (!pgdat) {
+ printk(KERN_ERR
+ "Can't online cpu %d due to NULL pgdat\n", cpu);
+ return -ENOMEM;
+ }
+
+ if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
+#endif
cpu_maps_update_begin();
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 988573a..e695c0a 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!tif_need_resched())
+ while (!need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -92,7 +92,8 @@ static void cpu_idle_loop(void)
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
cpu_idle_poll();
} else {
- if (!current_clr_polling_and_test()) {
+ current_clr_polling();
+ if (!need_resched()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
@@ -102,16 +103,9 @@ static void cpu_idle_loop(void)
} else {
local_irq_enable();
}
- __current_set_polling();
+ current_set_polling();
}
arch_cpu_idle_exit();
- /*
- * We need to test and propagate the TIF_NEED_RESCHED
- * bit here because we might not have send the
- * reschedule IPI to idle tasks.
- */
- if (tif_need_resched())
- set_preempt_need_resched();
}
tick_nohz_idle_exit();
schedule_preempt_disabled();
@@ -135,7 +129,7 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
- __current_set_polling();
+ current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4772034..6bf981e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,10 +1033,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop) {
- local_irq_disable();
+ if (need_loop)
write_seqcount_begin(&tsk->mems_allowed_seq);
- }
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1044,10 +1042,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
- if (need_loop) {
+ if (need_loop)
write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
task_unlock(tsk);
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7d2f35e..0506d44 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,12 +575,8 @@ return_normal:
raw_spin_lock(&dbg_slave_lock);
#ifdef CONFIG_SMP
- /* If send_ready set, slaves are already waiting */
- if (ks->send_ready)
- atomic_set(ks->send_ready, 1);
-
/* Signal the other CPUs to enter kgdb_wait() */
- else if ((!kgdb_single_step) && kgdb_do_roundup)
+ if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -682,11 +678,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
- memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
ks->signo = signo;
ks->err_code = ecode;
+ ks->kgdb_usethreadid = 0;
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
@@ -736,30 +732,6 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
-int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
-{
-#ifdef CONFIG_SMP
- if (!kgdb_io_ready(0) || !send_ready)
- return 1;
-
- if (kgdb_info[cpu].enter_kgdb == 0) {
- struct kgdb_state kgdb_var;
- struct kgdb_state *ks = &kgdb_var;
-
- memset(ks, 0, sizeof(struct kgdb_state));
- ks->cpu = cpu;
- ks->ex_vector = trapnr;
- ks->signo = SIGTRAP;
- ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI;
- ks->linux_regs = regs;
- ks->send_ready = send_ready;
- kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
- return 0;
- }
-#endif
- return 1;
-}
-
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
{
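
[Note: the deleted kgdb_nmicallin() path used an atomic_t handshake (send_ready) so slave CPUs that had already called in via NMI could be released by the master without a second round-up. As a rough illustration only, the bare flag handshake in portable C11; both function names are invented for this sketch.]

#include <stdatomic.h>

static atomic_int send_ready;

static void master_release_slaves(void)
{
	atomic_store(&send_ready, 1);	/* publish "go" to waiting slaves */
}

static void slave_wait_for_master(void)
{
	while (!atomic_load(&send_ready))
		;	/* spin until the master signals readiness */
}
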
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 572aa4f..2235967 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,7 +26,6 @@ struct kgdb_state {
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
- atomic_t *send_ready;
};
/* Exception state values */
@@ -75,13 +74,11 @@ extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
-#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
return DBG_PASS_EVENT;
}
-#define KGDB_KDB_REASON_SYSTEM_NMI 0
#endif /* CONFIG_KGDB_KDB */
#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca3..328d18e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,10 +69,7 @@ int kdb_stub(struct kgdb_state *ks)
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
- if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
- reason = KDB_REASON_SYSTEM_NMI;
-
- else if (in_nmi())
+ if (in_nmi())
reason = KDB_REASON_NMI;
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8..00eb8f7 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,9 +1200,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
instruction_pointer(regs));
kdb_dumpregs(regs);
break;
- case KDB_REASON_SYSTEM_NMI:
- kdb_printf("due to System NonMaskable Interrupt\n");
- break;
case KDB_REASON_NMI:
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b7..d473988 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,6 +108,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
struct timespec ts;
cputime_t utime, stime, stimescaled, utimescaled;
+ /* Though tsk->delays accessed later, early exit avoids
+ * unnecessary returning of other data
+ */
+ if (!tsk->delays)
+ goto done;
+
tmp = (s64)d->cpu_run_real_total;
task_cputime(tsk, &utime, &stime);
cputime_to_timespec(utime + stime, &ts);
@@ -152,6 +158,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
+done:
return 0;
}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index e556751..ff915ef 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,19 +1,23 @@
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/binfmts.h>
+
+#include <asm/elf.h>
+
Elf_Half __weak elf_core_extra_phdrs(void)
{
return 0;
}
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
+ unsigned long limit)
{
return 1;
}
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
+int __weak elf_core_write_extra_data(struct file *file, size_t *size,
+ unsigned long limit)
{
return 1;
}
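
[Note: both sides of the elfcore.c diff rely on __weak default implementations that an architecture can override with a strong definition at link time. A short reminder of the weak-symbol idiom as GCC/Clang expose it; arch_hook is a made-up example name.]

/* Default implementation; any strong definition elsewhere wins at link time. */
__attribute__((weak)) int arch_hook(void)
{
	return 0;	/* fallback used when no override is linked in */
}
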
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72348dc..953c143 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
-static int perf_sample_allowed_ns __read_mostly =
- DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+static atomic_t perf_sample_allowed_ns __read_mostly =
+ ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
void update_perf_cpu_limits(void)
{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
tmp *= sysctl_perf_cpu_time_max_percent;
do_div(tmp, 100);
- ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+ atomic_set(&perf_sample_allowed_ns, tmp);
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -228,15 +228,14 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
* we detect that events are taking too long.
*/
#define NR_ACCUMULATED_SAMPLES 128
-static DEFINE_PER_CPU(u64, running_sample_length);
+DEFINE_PER_CPU(u64, running_sample_length);
void perf_sample_event_took(u64 sample_len_ns)
{
u64 avg_local_sample_len;
u64 local_samples_len;
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- if (allowed_ns == 0)
+ if (atomic_read(&perf_sample_allowed_ns) == 0)
return;
/* decay the counter by 1 average sample */
@@ -252,7 +251,7 @@ void perf_sample_event_took(u64 sample_len_ns)
*/
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
- if (avg_local_sample_len <= allowed_ns)
+ if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
return;
if (max_samples_per_tick <= 1)
@@ -263,9 +262,10 @@ void perf_sample_event_took(u64 sample_len_ns)
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
printk_ratelimited(KERN_WARNING
- "perf samples too long (%lld > %lld), lowering "
+ "perf samples too long (%lld > %d), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns,
+ avg_local_sample_len,
+ atomic_read(&perf_sample_allowed_ns),
sysctl_perf_event_sample_rate);
update_perf_cpu_limits();
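
[Note: on both sides of this hunk, perf_sample_event_took() maintains a running sum that always stands for the last NR_ACCUMULATED_SAMPLES samples: it decays the sum by one average before adding each new sample, then compares the implied average against the allowed budget. A minimal userspace sketch of that decay-and-add scheme; sample_took_too_long is a hypothetical name.]

#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

static uint64_t running_len;	/* sum standing for the last 128 samples */

static int sample_took_too_long(uint64_t sample_len_ns, uint64_t allowed_ns)
{
	running_len -= running_len / NR_ACCUMULATED_SAMPLES;	/* decay by one average */
	running_len += sample_len_ns;				/* add the new sample */
	return running_len / NR_ACCUMULATED_SAMPLES > allowed_ns;
}
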
@@ -899,7 +899,6 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
- ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1137,8 +1136,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
-
- ctx->generation++;
}
/*
@@ -1204,9 +1201,6 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
- if (sample_type & PERF_SAMPLE_TRANSACTION)
- size += sizeof(data->txn);
-
event->header_size = size;
}
@@ -1316,8 +1310,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
-
- ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -2154,38 +2146,22 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they have both been
- * cloned from the same version of the same context.
- *
- * Equivalence is measured using a generation number in the context that is
- * incremented on each modification to it; see unclone_ctx(), list_add_event()
- * and list_del_event().
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled events.
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- /* Pinning disables the swap optimization */
- if (ctx1->pin_count || ctx2->pin_count)
- return 0;
-
- /* If ctx1 is the parent of ctx2 */
- if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
- return 1;
-
- /* If ctx2 is the parent of ctx1 */
- if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
- return 1;
-
- /*
- * If ctx1 and ctx2 have the same parent; we flatten the parent
- * hierarchy, see perf_event_init_context().
- */
- if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
- ctx1->parent_gen == ctx2->parent_gen)
- return 1;
-
- /* Unmatched */
- return 0;
+ return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+ && ctx1->parent_gen == ctx2->parent_gen
+ && !ctx1->pin_count && !ctx2->pin_count;
}
static void __perf_event_sync_stat(struct perf_event *event,
@@ -2234,6 +2210,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -2265,7 +2244,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent, *next_parent;
+ struct perf_event_context *parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -2277,18 +2256,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
- if (!next_ctx)
- goto unlock;
-
parent = rcu_dereference(ctx->parent_ctx);
- next_parent = rcu_dereference(next_ctx->parent_ctx);
-
- /* If neither context have a parent context; they cannot be clones. */
- if (!parent && !next_parent)
- goto unlock;
-
- if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
+ next_ctx = next->perf_event_ctxp[ctxn];
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2316,7 +2287,6 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
-unlock:
rcu_read_unlock();
if (do_switch) {
@@ -4602,9 +4572,6 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
- if (sample_type & PERF_SAMPLE_TRANSACTION)
- perf_output_put(handle, data->txn);
-
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5133,26 +5100,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ const char *name;
+
+ memset(tmp, 0, sizeof(tmp));
if (file) {
struct inode *inode;
dev_t dev;
-
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf) {
- name = "//enomem";
- goto cpy_name;
- }
/*
- * d_path() works from the end of the rb backwards, so we
+ * d_path works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
+ if (!buf) {
+ name = strncpy(tmp, "//enomem", sizeof(tmp));
+ goto got_name;
+ }
+ name = d_path(&file->f_path, buf, PATH_MAX);
if (IS_ERR(name)) {
- name = "//toolong";
- goto cpy_name;
+ name = strncpy(tmp, "//toolong", sizeof(tmp));
+ goto got_name;
}
inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
@@ -5160,39 +5128,34 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
- goto got_name;
+
} else {
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
+ if (arch_vma_name(mmap_event->vma)) {
+ name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+ sizeof(tmp) - 1);
+ tmp[sizeof(tmp) - 1] = '\0';
+ goto got_name;
+ }
- if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (!vma->vm_mm) {
+ name = strncpy(tmp, "[vdso]", sizeof(tmp));
+ goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = strncpy(tmp, "[heap]", sizeof(tmp));
+ goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
+ name = strncpy(tmp, "[stack]", sizeof(tmp));
+ goto got_name;
}
- name = "//anon";
- goto cpy_name;
+ name = strncpy(tmp, "//anon", sizeof(tmp));
+ goto got_name;
}
-cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
- name = tmp;
got_name:
- /*
- * Since our buffer works in 8 byte units we need to align our string
- * size to a multiple of 8. However, we must guarantee the tail end is
- * zero'd out to avoid leaking random bits to userspace.
- */
- size = strlen(name)+1;
- while (!IS_ALIGNED(size, sizeof(u64)))
- name[size++] = '\0';
+ size = ALIGN(strlen(name)+1, sizeof(u64));
mmap_event->file_name = name;
mmap_event->file_size = size;
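
Both versions of perf_event_mmap_event() pad the file name to a u64 multiple because the ring buffer is written in 8-byte units; they differ in how the padding bytes are kept clean. The reverted code zeroes the tail explicitly after strlcpy, while the restored code relies on pre-zeroed buffers (kzalloc, the memset of tmp) plus ALIGN(). A small demo of the size math:

    #include <stdio.h>
    #include <string.h>

    #define ALIGN8(x) (((x) + 7UL) & ~7UL)   /* ALIGN(x, sizeof(u64)) */

    int main(void)
    {
        const char *names[] = { "[heap]", "//enomem", "[vdso]" };

        for (int i = 0; i < 3; i++) {
            size_t len = strlen(names[i]) + 1;        /* with NUL */
            printf("%-8s len=%zu padded=%lu\n",
                   names[i], len, (unsigned long)ALIGN8(len));
        }
        return 0;
    }
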
@@ -5680,6 +5643,11 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
+ if (event->cpu != -1) {
+ swevent_hlist_put_cpu(event, event->cpu);
+ return;
+ }
+
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -5713,6 +5681,9 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
+ if (event->cpu != -1)
+ return swevent_hlist_get_cpu(event, event->cpu);
+
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
@@ -6321,7 +6292,6 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
-static DEVICE_ATTR_RO(type);
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
@@ -6366,19 +6336,17 @@ perf_event_mux_interval_ms_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-static struct attribute *pmu_dev_attrs[] = {
- &dev_attr_type.attr,
- &dev_attr_perf_event_mux_interval_ms.attr,
- NULL,
+static struct device_attribute pmu_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_RW(perf_event_mux_interval_ms),
+ __ATTR_NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_groups = pmu_dev_groups,
+ .dev_attrs = pmu_dev_attrs,
};
static void pmu_dev_release(struct device *dev)
@@ -7158,6 +7126,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
perf_install_in_context(ctx, event, event->cpu);
+ ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -7240,6 +7209,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
+ ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b2187..ca65997 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
+static inline unsigned int \
func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+ const void *buf, unsigned int len) \
{ \
unsigned long size, written; \
\
do { \
- size = min(handle->size, len); \
+ size = min_t(unsigned long, handle->size, len); \
+ \
written = memcpy_func(handle->addr, buf, size); \
- written = size - written; \
\
len -= written; \
handle->addr += written; \
@@ -110,37 +110,20 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
-static inline unsigned long
-memcpy_common(void *dst, const void *src, unsigned long n)
+static inline int memcpy_common(void *dst, const void *src, size_t n)
{
memcpy(dst, src, n);
- return 0;
+ return n;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
-static inline unsigned long
-memcpy_skip(void *dst, const void *src, unsigned long n)
-{
- return 0;
-}
+#define MEMCPY_SKIP(dst, src, n) (n)
-DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
+DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
#ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user arch_perf_out_copy_user
-
-static inline unsigned long
-arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
-{
- unsigned long ret;
-
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst, src, n);
- pagefault_enable();
-
- return ret;
-}
+#define arch_perf_out_copy_user __copy_from_user_inatomic
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
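
The DEFINE_OUTPUT_COPY change is about the return convention of the per-flavour copy routines. The reverted v3.13 helpers return the number of bytes *not* copied — the __copy_from_user_inatomic convention — and the wrapper computed written = size - ret; the restored helpers return the number of bytes written, which is why __copy_from_user_inatomic can be aliased directly and the pagefault_disable() wrapper goes away. A userspace model of the restored convention:

    #include <stddef.h>
    #include <string.h>

    /* Returns bytes written, memcpy-style (restored convention). */
    static size_t memcpy_common(void *dst, const void *src, size_t n)
    {
        memcpy(dst, src, n);
        return n;
    }

    /* "Skip" flavour: report n bytes written without touching dst,
     * so the output handle still advances past the skipped region. */
    #define MEMCPY_SKIP(dst, src, n) (n)
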
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8b168a..9c2ddfb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,10 +12,40 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
-#include <linux/circ_buf.h>
#include "internal.h"
+static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
+ unsigned long offset, unsigned long head)
+{
+ unsigned long sz = perf_data_size(rb);
+ unsigned long mask = sz - 1;
+
+ /*
+ * check if user-writable
+ * overwrite : over-write its own tail
+ * !overwrite: buffer possibly drops events.
+ */
+ if (rb->overwrite)
+ return true;
+
+ /*
+ * verify that payload is not bigger than buffer
+ * otherwise masking logic may fail to detect
+ * the "not enough space" condition
+ */
+ if ((head - offset) > sz)
+ return false;
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
+ if ((int)(head - offset) < 0)
+ return false;
+
+ return true;
+}
+
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
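
perf_output_space() reimplements what the reverted code expressed as CIRC_SPACE(head, tail, perf_data_size(rb)) < size: with a power-of-two buffer, distances measured from the consumer's published tail can be compared after masking. A self-contained model of the non-overwrite path (in the kernel version, overwrite mode returns true unconditionally):

    #include <stdbool.h>
    #include <stdio.h>

    /* tail: consumer position published by userspace; [offset, head)
     * is the region the producer wants to reserve; sz is a power of 2. */
    static bool output_space(unsigned long tail, unsigned long offset,
                             unsigned long head, unsigned long sz)
    {
        unsigned long mask = sz - 1;

        if (head - offset > sz)            /* payload bigger than buffer */
            return false;

        offset = (offset - tail) & mask;   /* distance tail -> start */
        head   = (head - tail) & mask;     /* distance tail -> end   */

        return (long)(head - offset) >= 0; /* end must not pass tail */
    }

    int main(void)
    {
        printf("%d\n", output_space(0, 100, 200, 4096));   /* 1: fits    */
        printf("%d\n", output_space(300, 100, 400, 4096)); /* 0: overrun */
        return 0;
    }
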
@@ -85,8 +115,8 @@ again:
rb->user_page->data_head = head;
/*
- * Now check if we missed an update -- rely on previous implied
- * compiler barriers to force a re-read.
+ * Now check if we missed an update, rely on the (compiler)
+ * barrier in atomic_dec_and_test() to re-read rb->head.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
@@ -105,7 +135,8 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost, page_shift;
+ int have_lost;
+ struct perf_sample_data sample_data;
struct {
struct perf_event_header header;
u64 id;
@@ -120,63 +151,57 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (unlikely(!rb))
+ if (!rb)
goto out;
- if (unlikely(!rb->nr_pages))
- goto out;
+ handle->rb = rb;
+ handle->event = event;
- handle->rb = rb;
- handle->event = event;
+ if (!rb->nr_pages)
+ goto out;
have_lost = local_read(&rb->lost);
- if (unlikely(have_lost)) {
- size += sizeof(lost_event);
- if (event->attr.sample_id_all)
- size += event->id_header_size;
+ if (have_lost) {
+ lost_event.header.size = sizeof(lost_event);
+ perf_event_header__init_id(&lost_event.header, &sample_data,
+ event);
+ size += lost_event.header.size;
}
perf_output_get_handle(handle);
do {
+ /*
+ * Userspace could choose to issue a mb() before updating the
+ * tail pointer. So that all reads will be completed before the
+ * write is issued.
+ *
+ * See perf_output_put_handle().
+ */
tail = ACCESS_ONCE(rb->user_page->data_tail);
+ smp_mb();
offset = head = local_read(&rb->head);
- if (!rb->overwrite &&
- unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
- goto fail;
head += size;
+ if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ goto fail;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- /*
- * Separate the userpage->tail read from the data stores below.
- * Matches the MB userspace SHOULD issue after reading the data
- * and before storing the new tail position.
- *
- * See perf_output_put_handle().
- */
- smp_mb();
-
- if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
+ if (head - local_read(&rb->wakeup) > rb->watermark)
local_add(rb->watermark, &rb->wakeup);
- page_shift = PAGE_SHIFT + page_order(rb);
+ handle->page = offset >> (PAGE_SHIFT + page_order(rb));
+ handle->page &= rb->nr_pages - 1;
+ handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
+ handle->addr = rb->data_pages[handle->page];
+ handle->addr += handle->size;
+ handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
- handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
- offset &= (1UL << page_shift) - 1;
- handle->addr = rb->data_pages[handle->page] + offset;
- handle->size = (1UL << page_shift) - offset;
-
- if (unlikely(have_lost)) {
- struct perf_sample_data sample_data;
-
- lost_event.header.size = sizeof(lost_event);
+ if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
- perf_event_header__init_id(&lost_event.header,
- &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6c..ad8e1bd 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,7 +35,6 @@
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
-#include <linux/task_work.h>
#include <linux/uprobes.h>
@@ -245,12 +244,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
* supported by that architecture then we need to modify is_trap_at_addr and
- * uprobe_write_opcode accordingly. This would never be a problem for archs
- * that have fixed length instructions.
+ * write_opcode accordingly. This would never be a problem for archs that
+ * have fixed length instructions.
*/
/*
- * uprobe_write_opcode - write the opcode at a given virtual address.
+ * write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -261,7 +260,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* For mm @mm, write the opcode at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
+static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
@@ -315,7 +314,7 @@ put_old:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -330,7 +329,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -504,8 +503,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
return ret;
}
-static int __copy_insn(struct address_space *mapping, struct file *filp,
- void *insn, int nbytes, loff_t offset)
+static int
+__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
+ unsigned long nbytes, loff_t offset)
{
struct page *page;
@@ -527,28 +527,28 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
- struct address_space *mapping = uprobe->inode->i_mapping;
- loff_t offs = uprobe->offset;
- void *insn = uprobe->arch.insn;
- int size = MAX_UINSN_BYTES;
- int len, err = -EIO;
+ struct address_space *mapping;
+ unsigned long nbytes;
+ int bytes;
- /* Copy only available bytes, -EIO if nothing was read */
- do {
- if (offs >= i_size_read(uprobe->inode))
- break;
+ nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
+ mapping = uprobe->inode->i_mapping;
- len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
- err = __copy_insn(mapping, filp, insn, len, offs);
- if (err)
- break;
-
- insn += len;
- offs += len;
- size -= len;
- } while (size);
+ /* Instruction at end of binary; copy only available bytes */
+ if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
+ bytes = uprobe->inode->i_size - uprobe->offset;
+ else
+ bytes = MAX_UINSN_BYTES;
- return err;
+ /* Instruction at the page-boundary; copy bytes in second page */
+ if (nbytes < bytes) {
+ int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
+ bytes - nbytes, uprobe->offset + nbytes);
+ if (err)
+ return err;
+ bytes = nbytes;
+ }
+ return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
}
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
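
The restored copy_insn() replaces the generic loop with explicit page-boundary arithmetic: clamp the copy to the file size, then, if the instruction straddles a page, fetch the tail bytes from the second page before the head bytes from the first. A compilable model of that planning step (MAX_UINSN_BYTES is an arch constant; 16 here is an assumption):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))
    #define MAX_UINSN_BYTES 16UL    /* assumed arch maximum */

    static void plan_copy(unsigned long offset, unsigned long file_size)
    {
        unsigned long nbytes = PAGE_SIZE - (offset & ~PAGE_MASK);
        unsigned long bytes = MAX_UINSN_BYTES;

        if (offset + bytes > file_size)        /* insn at end of binary */
            bytes = file_size - offset;

        if (nbytes < bytes) {                  /* straddles a page */
            printf("second page: %lu bytes at offset %lu\n",
                   bytes - nbytes, offset + nbytes);
            bytes = nbytes;
        }
        printf("first page: %lu bytes at offset %lu\n", bytes, offset);
    }

    int main(void)
    {
        plan_copy(4090, 1UL << 20);  /* 6 bytes in page 0, 10 in page 1 */
        return 0;
    }
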
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
+ /* write_opcode() assumes we don't cross page boundary */
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
@@ -1096,22 +1096,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
}
/* Slot allocation for XOL */
-static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
+static int xol_add_vma(struct xol_area *area)
{
+ struct mm_struct *mm = current->mm;
int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
- if (!area->vaddr) {
- /* Try to map as high as possible, this is only a hint. */
- area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
- PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
- ret = area->vaddr;
- goto fail;
- }
+ ret = -ENOMEM;
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
}
ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1121,19 +1120,30 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
+ ret = 0;
fail:
up_write(&mm->mmap_sem);
return ret;
}
-static struct xol_area *__create_xol_area(unsigned long vaddr)
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
{
struct mm_struct *mm = current->mm;
- uprobe_opcode_t insn = UPROBE_SWBP_INSN;
struct xol_area *area;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ area = mm->uprobes_state.xol_area;
+ if (area)
+ goto ret;
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1145,14 +1155,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->page)
goto free_bitmap;
- area->vaddr = vaddr;
- init_waitqueue_head(&area->wq);
- /* Reserve the 1st slot for get_trampoline_vaddr() */
+ /* allocate first slot of task's xol_area for the return probes */
set_bit(0, area->bitmap);
- atomic_set(&area->slot_count, 1);
copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ atomic_set(&area->slot_count, 1);
+ init_waitqueue_head(&area->wq);
- if (!xol_add_vma(mm, area))
+ if (!xol_add_vma(area))
return area;
__free_page(area->page);
@@ -1161,25 +1170,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
free_area:
kfree(area);
out:
- return NULL;
-}
-
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
-{
- struct mm_struct *mm = current->mm;
- struct xol_area *area;
-
- if (!mm->uprobes_state.xol_area)
- __create_xol_area(0);
-
area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ ret:
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
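
The barrier pairing being rearranged here is the classic publish/consume pattern: xol_add_vma() fully initializes the area and publishes it with smp_wmb() before the store to mm->uprobes_state.xol_area becomes visible, and get_xol_area() orders its dependent reads with smp_read_barrier_depends() after loading the pointer. A portable C11 model of the same handoff (the kernel primitives, not these atomics, are what the code actually uses):

    #include <stdatomic.h>

    struct xol_area { unsigned long vaddr; /* ... bitmap, page, wq ... */ };

    static _Atomic(struct xol_area *) published_area;

    static void publish(struct xol_area *a)
    {
        a->vaddr = 0x7fff0000;      /* initialize everything first */
        atomic_store_explicit(&published_area, a, memory_order_release);
    }

    static struct xol_area *lookup(void)
    {
        /* acquire is the portable stand-in for the data-dependency
         * barrier the kernel relies on here */
        return atomic_load_explicit(&published_area, memory_order_acquire);
    }
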
@@ -1263,8 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
return 0;
/* Initialize the slot */
- copy_to_page(area->page, xol_vaddr,
- uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+ copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
@@ -1353,6 +1345,14 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t)
+{
+ t->utask = NULL;
+}
+
+/*
* Allocate a uprobe_task object for the task if if necessary.
* Called when the thread hits a breakpoint.
*
@@ -1367,90 +1367,6 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
-static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
-{
- struct uprobe_task *n_utask;
- struct return_instance **p, *o, *n;
-
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
- if (!n_utask)
- return -ENOMEM;
- t->utask = n_utask;
-
- p = &n_utask->return_instances;
- for (o = o_utask->return_instances; o; o = o->next) {
- n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- *n = *o;
- atomic_inc(&n->uprobe->ref);
- n->next = NULL;
-
- *p = n;
- p = &n->next;
- n_utask->depth++;
- }
-
- return 0;
-}
-
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
- pr_warn("uprobe: %s:%d failed to %s\n",
- current->comm, current->pid, msg);
-}
-
-static void dup_xol_work(struct callback_head *work)
-{
- kfree(work);
-
- if (current->flags & PF_EXITING)
- return;
-
- if (!__create_xol_area(current->utask->vaddr))
- uprobe_warn(current, "dup xol area");
-}
-
-/*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t, unsigned long flags)
-{
- struct uprobe_task *utask = current->utask;
- struct mm_struct *mm = current->mm;
- struct callback_head *work;
- struct xol_area *area;
-
- t->utask = NULL;
-
- if (!utask || !utask->return_instances)
- return;
-
- if (mm == t->mm && !(flags & CLONE_VFORK))
- return;
-
- if (dup_utask(t, utask))
- return uprobe_warn(t, "dup ret instances");
-
- /* The task can fork() after dup_xol_work() fails */
- area = mm->uprobes_state.xol_area;
- if (!area)
- return uprobe_warn(t, "dup xol area");
-
- if (mm == t->mm)
- return;
-
- /* TODO: move it into the union in uprobe_task */
- work = kmalloc(sizeof(*work), GFP_KERNEL);
- if (!work)
- return uprobe_warn(t, "dup xol area");
-
- t->utask->vaddr = area->vaddr;
- init_task_work(work, dup_xol_work);
- task_work_add(t, work, true);
-}
-
/*
* Current area->vaddr notion assume the trampoline address is always
* equal area->vaddr.
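
The large deletion above removes uretprobe state inheritance: v3.13's uprobe_copy_process() deep-copied the child's chain of pending return_instances (and arranged, via task_work, to duplicate the XOL area when the child gets its own mm), while the restored version only clears t->utask. The list duplication used the usual tail pointer-to-pointer idiom; a self-contained sketch:

    #include <stdlib.h>

    struct return_instance {
        struct return_instance *next;
        /* ... saved return address, uprobe reference, ... */
    };

    /* Duplicate src's chain into *dst_head, preserving order.
     * Mirrors the deleted dup_utask() loop; error handling simplified. */
    static int dup_chain(const struct return_instance *src,
                         struct return_instance **dst_head)
    {
        struct return_instance **p = dst_head;

        for (; src; src = src->next) {
            struct return_instance *n = malloc(sizeof(*n));
            if (!n)
                return -1;
            *n = *src;
            n->next = NULL;
            *p = n;           /* append at the tail ... */
            p = &n->next;     /* ... and advance the tail slot */
        }
        return 0;
    }
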
@@ -1941,4 +1857,9 @@ static int __init init_uprobes(void)
return register_die_notifier(&uprobe_exception_nb);
}
-__initcall(init_uprobes);
+module_init(init_uprobes);
+
+static void __exit exit_uprobes(void)
+{
+}
+module_exit(exit_uprobes);
diff --git a/kernel/extable.c b/kernel/extable.c
index 763faf0..832cb28 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
+ addr <= (unsigned long)_einittext)
return 1;
return 0;
}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ addr <= (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
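
Both extable hunks flip a boundary test: the reverted v3.13 code treats _einittext/_etext as exclusive end bounds, matching the usual [start, end) section convention, while the restored code accepts an address equal to the end symbol. The difference is visible only at that single boundary address:

    #include <stdio.h>

    static int in_text_v3_13(unsigned long a, unsigned long s, unsigned long e)
    {
        return a >= s && a < e;      /* end-exclusive */
    }

    static int in_text_v3_12(unsigned long a, unsigned long s, unsigned long e)
    {
        return a >= s && a <= e;     /* end-inclusive (restored) */
    }

    int main(void)
    {
        unsigned long stext = 0x1000, etext = 0x2000;

        printf("%d %d\n", in_text_v3_13(etext, stext, etext),
                          in_text_v3_12(etext, stext, etext)); /* 0 1 */
        return 0;
    }
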
diff --git a/kernel/fork.c b/kernel/fork.c
index 728d5be..086fe73 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->flags = (current->mm) ?
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
- atomic_long_set(&mm->nr_ptes, 0);
+ mm->nr_ptes = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
mm_init_aio(mm);
@@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON(mm->pmd_huge_pte);
#endif
}
@@ -814,9 +814,12 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->first_nid = NUMA_PTE_SCAN_INIT;
+#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -1310,7 +1313,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(clone_flags, p);
+ sched_fork(p);
retval = perf_event_init_task(p);
if (retval)
@@ -1370,6 +1373,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
+ uprobe_copy_process(p);
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1486,7 +1490,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
- uprobe_copy_process(p, clone_flags);
return p;
diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086..c3a1a55 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -66,7 +66,7 @@
#include <asm/futex.h>
-#include "locking/rtmutex_common.h"
+#include "rtmutex_common.h"
int __read_mostly futex_cmpxchg_enabled;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8a..d4da55d 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -46,34 +46,4 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
-choice
- prompt "Specify GCOV format"
- depends on GCOV_KERNEL
- default GCOV_FORMAT_AUTODETECT
- ---help---
- The gcov format is usually determined by the GCC version, but there are
- exceptions where format changes are integrated in lower-version GCCs.
- In such a case use this option to adjust the format used in the kernel
- accordingly.
-
- If unsure, choose "Autodetect".
-
-config GCOV_FORMAT_AUTODETECT
- bool "Autodetect"
- ---help---
- Select this option to use the format that corresponds to your GCC
- version.
-
-config GCOV_FORMAT_3_4
- bool "GCC 3.4 format"
- ---help---
- Select this option to use the format defined by GCC 3.4.
-
-config GCOV_FORMAT_4_7
- bool "GCC 4.7 format"
- ---help---
- Select this option to use the format defined by GCC 4.7.
-
-endchoice
-
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8..e97ca59 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,3 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
- cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
- cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
- cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
- obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
- obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b..9b22d03 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,6 +20,7 @@
#include <linux/mutex.h>
#include "gcov.h"
+static struct gcov_info *gcov_info_head;
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
@@ -33,7 +34,7 @@ void __gcov_init(struct gcov_info *info)
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
- gcov_version = gcov_info_version(info);
+ gcov_version = info->version;
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
@@ -44,7 +45,8 @@ void __gcov_init(struct gcov_info *info)
* Add new profiling data structure to list and inform event
* listener.
*/
- gcov_info_link(info);
+ info->next = gcov_info_head;
+ gcov_info_head = info;
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
@@ -79,12 +81,6 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_delta);
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
@@ -95,15 +91,13 @@ EXPORT_SYMBOL(__gcov_merge_ior);
*/
void gcov_enable_events(void)
{
- struct gcov_info *info = NULL;
+ struct gcov_info *info;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
-
/* Perform event callback for previously registered entries. */
- while ((info = gcov_info_next(info)))
+ for (info = gcov_info_head; info; info = info->next)
gcov_event(GCOV_ADD, info);
-
mutex_unlock(&gcov_lock);
}
@@ -118,23 +112,25 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
- struct gcov_info *info = NULL;
- struct gcov_info *prev = NULL;
+ struct gcov_info *info;
+ struct gcov_info *prev;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
-
+ prev = NULL;
/* Remove entries located in module from linked list. */
- while ((info = gcov_info_next(info))) {
+ for (info = gcov_info_head; info; info = info->next) {
if (within(info, mod->module_core, mod->core_size)) {
- gcov_info_unlink(prev, info);
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
-
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
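
This file (together with the gcov.h hunk below) undoes the abstraction that let the gcc-3.4 and gcc-4.7 data layouts coexist: generic code goes back to dereferencing struct gcov_info and walking a visible gcov_info_head list, instead of the opaque gcov_info_next()/gcov_info_link()/gcov_info_unlink() accessors. A minimal model of the deleted accessor-style iteration:

    #include <stddef.h>

    struct gcov_info { struct gcov_info *next; /* layout-private fields */ };

    static struct gcov_info *gcov_info_head;

    /* From the deleted interface: NULL asks for the first entry. */
    static struct gcov_info *gcov_info_next(struct gcov_info *info)
    {
        return info ? info->next : gcov_info_head;
    }

    static void for_each_info(void (*fn)(struct gcov_info *))
    {
        struct gcov_info *info = NULL;

        while ((info = gcov_info_next(info)))
            fn(info);   /* generic code never touches the layout */
    }
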
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a..7a7d2ee 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str)
unsigned long val;
if (kstrtoul(str, 0, &val)) {
- pr_warn("invalid gcov_persist parameter '%s'\n", str);
+ pr_warning("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
- if (info && (strcmp(gcov_info_filename(info), name) == 0))
+ if (info && (strcmp(info->filename, name) == 0))
return node;
}
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
- node = get_node_by_name(gcov_info_filename(info));
+ node = get_node_by_name(info->filename);
if (node) {
/* Reset counts or remove node for unloaded modules. */
if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
- const char *basename;
+ char *basename;
char *target;
int num;
int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
if (!node->links)
return;
for (i = 0; i < num; i++) {
- target = get_link_target(
- gcov_info_filename(get_node_info(node)),
- &gcov_link[i]);
+ target = get_link_target(get_node_info(node)->filename,
+ &gcov_link[i]);
if (!target)
goto out_err;
- basename = kbasename(target);
- if (basename == target)
+ basename = strrchr(target, '/');
+ if (!basename)
goto out_err;
+ basename++;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
- pr_warn("could not create file\n");
+ pr_warning("could not create file\n");
kfree(node);
return NULL;
}
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
err_nomem:
kfree(node);
- pr_warn("out of memory\n");
+ pr_warning("out of memory\n");
return NULL;
}
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
struct gcov_node *parent;
struct gcov_node *node;
- filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
+ filename = kstrdup(info->filename, GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
*/
loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
if (!loaded_info) {
- pr_warn("could not add '%s' (out of memory)\n",
- gcov_info_filename(info));
+ pr_warning("could not add '%s' (out of memory)\n",
+ info->filename);
return;
}
memcpy(loaded_info, node->loaded_info,
@@ -644,9 +644,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* data set replaces the copy of the last one.
*/
if (!gcov_info_is_compatible(node->unloaded_info, info)) {
- pr_warn("discarding saved data for %s "
- "(incompatible version)\n",
- gcov_info_filename(info));
+ pr_warning("discarding saved data for %s "
+ "(incompatible version)\n", info->filename);
gcov_info_free(node->unloaded_info);
node->unloaded_info = NULL;
}
@@ -656,8 +655,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* The initial one takes precedence.
*/
if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
- pr_warn("could not add '%s' (incompatible "
- "version)\n", gcov_info_filename(info));
+ pr_warning("could not add '%s' (incompatible "
+ "version)\n", info->filename);
kfree(loaded_info);
return;
}
@@ -692,9 +691,8 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
else {
node->unloaded_info = gcov_info_dup(info);
if (!node->unloaded_info) {
- pr_warn("could not save data for '%s' "
- "(out of memory)\n",
- gcov_info_filename(info));
+ pr_warning("could not save data for '%s' "
+ "(out of memory)\n", info->filename);
}
}
}
@@ -709,8 +707,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
i = get_info_index(node, info);
if (i < 0) {
- pr_warn("could not remove '%s' (not found)\n",
- gcov_info_filename(info));
+ pr_warning("could not remove '%s' (not found)\n",
+ info->filename);
return;
}
if (gcov_persist)
@@ -737,7 +735,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
struct gcov_node *node;
mutex_lock(&node_lock);
- node = get_node_by_name(gcov_info_filename(info));
+ node = get_node_by_name(info->filename);
switch (action) {
case GCOV_ADD:
if (node)
@@ -749,8 +747,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
if (node)
remove_info(node, info);
else {
- pr_warn("could not remove '%s' (not found)\n",
- gcov_info_filename(info));
+ pr_warning("could not remove '%s' (not found)\n",
+ info->filename);
}
break;
}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 27bc88a..ae5bb42 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,121 +21,6 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#define GCOV_COUNTERS 5
-
-static struct gcov_info *gcov_info_head;
-
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[0];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
-
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[0];
-};
-
-/**
- * gcov_info_filename - return info filename
- * @info: profiling data set
- */
-const char *gcov_info_filename(struct gcov_info *info)
-{
- return info->filename;
-}
-
-/**
- * gcov_info_version - return info version
- * @info: profiling data set
- */
-unsigned int gcov_info_version(struct gcov_info *info)
-{
- return info->version;
-}
-
-/**
- * gcov_info_next - return next profiling data set
- * @info: profiling data set
- *
- * Returns next gcov_info following @info or first gcov_info in the chain if
- * @info is %NULL.
- */
-struct gcov_info *gcov_info_next(struct gcov_info *info)
-{
- if (!info)
- return gcov_info_head;
-
- return info->next;
-}
-
-/**
- * gcov_info_link - link/add profiling data set to the list
- * @info: profiling data set
- */
-void gcov_info_link(struct gcov_info *info)
-{
- info->next = gcov_info_head;
- gcov_info_head = info;
-}
-
-/**
- * gcov_info_unlink - unlink/remove profiling data set from the list
- * @prev: previous profiling data set
- * @info: profiling data set
- */
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
-{
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
-}
-
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
deleted file mode 100644
index 2c6e463..0000000
--- a/kernel/gcov/gcc_4_7.c
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- * This code provides functions to handle gcc's profiling data format
- * introduced with gcc 4.7.
- *
- * This file is based heavily on gcc_3_4.c file.
- *
- * For a better understanding, refer to gcc source:
- * gcc/gcov-io.h
- * libgcc/libgcov.c
- *
- * Uses gcc-internal data definitions.
- */
-
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
-#include "gcov.h"
-
-#define GCOV_COUNTERS 8
-#define GCOV_TAG_FUNCTION_LENGTH 3
-
-static struct gcov_info *gcov_info_head;
-
-/**
- * struct gcov_ctr_info - information about counters for a single function
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
-};
-
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @key: comdat key
- * @ident: unique ident of function
- * @lineno_checksum: function lineo_checksum
- * @cfg_checksum: function cfg checksum
- * @ctrs: instrumented counters
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- *
- * Information about a single function. This uses the trailing array
- * idiom. The number of counters is determined from the merge pointer
- * array in gcov_info. The key is used to detect which of a set of
- * comdat functions was selected -- it points to the gcov_info object
- * of the object file containing the selected comdat function.
- */
-struct gcov_fn_info {
- const struct gcov_info *key;
- unsigned int ident;
- unsigned int lineno_checksum;
- unsigned int cfg_checksum;
- struct gcov_ctr_info ctrs[0];
-};
-
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: uniquifying time stamp
- * @filename: name of the associated gcov data file
- * @merge: merge functions (null for unused counter type)
- * @n_functions: number of instrumented functions
- * @functions: pointer to pointers to function information
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
- unsigned int n_functions;
- struct gcov_fn_info **functions;
-};
-
-/**
- * gcov_info_filename - return info filename
- * @info: profiling data set
- */
-const char *gcov_info_filename(struct gcov_info *info)
-{
- return info->filename;
-}
-
-/**
- * gcov_info_version - return info version
- * @info: profiling data set
- */
-unsigned int gcov_info_version(struct gcov_info *info)
-{
- return info->version;
-}
-
-/**
- * gcov_info_next - return next profiling data set
- * @info: profiling data set
- *
- * Returns next gcov_info following @info or first gcov_info in the chain if
- * @info is %NULL.
- */
-struct gcov_info *gcov_info_next(struct gcov_info *info)
-{
- if (!info)
- return gcov_info_head;
-
- return info->next;
-}
-
-/**
- * gcov_info_link - link/add profiling data set to the list
- * @info: profiling data set
- */
-void gcov_info_link(struct gcov_info *info)
-{
- info->next = gcov_info_head;
- gcov_info_head = info;
-}
-
-/**
- * gcov_info_unlink - unlink/remove profiling data set from the list
- * @prev: previous profiling data set
- * @info: profiling data set
- */
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
-{
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
-}
-
-/* Symbolic links to be created for each profiling data file. */
-const struct gcov_link gcov_link[] = {
- { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
- { 0, NULL},
-};
-
-/*
- * Determine whether a counter is active. Doesn't change at run-time.
- */
-static int counter_active(struct gcov_info *info, unsigned int type)
-{
- return info->merge[type] ? 1 : 0;
-}
-
-/* Determine number of active counters. Based on gcc magic. */
-static unsigned int num_counter_active(struct gcov_info *info)
-{
- unsigned int i;
- unsigned int result = 0;
-
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(info, i))
- result++;
- }
- return result;
-}
-
-/**
- * gcov_info_reset - reset profiling data to zero
- * @info: profiling data set
- */
-void gcov_info_reset(struct gcov_info *info)
-{
- struct gcov_ctr_info *ci_ptr;
- unsigned int fi_idx;
- unsigned int ct_idx;
-
- for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
- ci_ptr = info->functions[fi_idx]->ctrs;
-
- for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
- if (!counter_active(info, ct_idx))
- continue;
-
- memset(ci_ptr->values, 0,
- sizeof(gcov_type) * ci_ptr->num);
- ci_ptr++;
- }
- }
-}
-
-/**
- * gcov_info_is_compatible - check if profiling data can be added
- * @info1: first profiling data set
- * @info2: second profiling data set
- *
- * Returns non-zero if profiling data can be added, zero otherwise.
- */
-int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
-{
- return (info1->stamp == info2->stamp);
-}
-
-/**
- * gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
- *
- * Adds profiling counts of @source to @dest.
- */
-void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
-{
- struct gcov_ctr_info *dci_ptr;
- struct gcov_ctr_info *sci_ptr;
- unsigned int fi_idx;
- unsigned int ct_idx;
- unsigned int val_idx;
-
- for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
- dci_ptr = dst->functions[fi_idx]->ctrs;
- sci_ptr = src->functions[fi_idx]->ctrs;
-
- for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
- if (!counter_active(src, ct_idx))
- continue;
-
- for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
- dci_ptr->values[val_idx] +=
- sci_ptr->values[val_idx];
-
- dci_ptr++;
- sci_ptr++;
- }
- }
-}
-
-/**
- * gcov_info_dup - duplicate profiling data set
- * @info: profiling data set to duplicate
- *
- * Return newly allocated duplicate on success, %NULL on error.
- */
-struct gcov_info *gcov_info_dup(struct gcov_info *info)
-{
- struct gcov_info *dup;
- struct gcov_ctr_info *dci_ptr; /* dst counter info */
- struct gcov_ctr_info *sci_ptr; /* src counter info */
- unsigned int active;
- unsigned int fi_idx; /* function info idx */
- unsigned int ct_idx; /* counter type idx */
- size_t fi_size; /* function info size */
- size_t cv_size; /* counter values size */
-
- dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
- if (!dup)
- return NULL;
-
- dup->next = NULL;
- dup->filename = NULL;
- dup->functions = NULL;
-
- dup->filename = kstrdup(info->filename, GFP_KERNEL);
- if (!dup->filename)
- goto err_free;
-
- dup->functions = kcalloc(info->n_functions,
- sizeof(struct gcov_fn_info *), GFP_KERNEL);
- if (!dup->functions)
- goto err_free;
-
- active = num_counter_active(info);
- fi_size = sizeof(struct gcov_fn_info);
- fi_size += sizeof(struct gcov_ctr_info) * active;
-
- for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
- dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
- if (!dup->functions[fi_idx])
- goto err_free;
-
- *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
-
- sci_ptr = info->functions[fi_idx]->ctrs;
- dci_ptr = dup->functions[fi_idx]->ctrs;
-
- for (ct_idx = 0; ct_idx < active; ct_idx++) {
-
- cv_size = sizeof(gcov_type) * sci_ptr->num;
-
- dci_ptr->values = vmalloc(cv_size);
-
- if (!dci_ptr->values)
- goto err_free;
-
- dci_ptr->num = sci_ptr->num;
- memcpy(dci_ptr->values, sci_ptr->values, cv_size);
-
- sci_ptr++;
- dci_ptr++;
- }
- }
-
- return dup;
-err_free:
- gcov_info_free(dup);
- return NULL;
-}
-
-/**
- * gcov_info_free - release memory for profiling data set duplicate
- * @info: profiling data set duplicate to free
- */
-void gcov_info_free(struct gcov_info *info)
-{
- unsigned int active;
- unsigned int fi_idx;
- unsigned int ct_idx;
- struct gcov_ctr_info *ci_ptr;
-
- if (!info->functions)
- goto free_info;
-
- active = num_counter_active(info);
-
- for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
- if (!info->functions[fi_idx])
- continue;
-
- ci_ptr = info->functions[fi_idx]->ctrs;
-
- for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
- vfree(ci_ptr->values);
-
- kfree(info->functions[fi_idx]);
- }
-
-free_info:
- kfree(info->functions);
- kfree(info->filename);
- kfree(info);
-}
-
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
-
-/**
- * convert_to_gcda - convert profiling data set to gcda file format
- * @buffer: the buffer to store file data or %NULL if no data should be stored
- * @info: profiling data set to be converted
- *
- * Returns the number of bytes that were/would have been stored into the buffer.
- */
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
-{
- struct gcov_fn_info *fi_ptr;
- struct gcov_ctr_info *ci_ptr;
- unsigned int fi_idx;
- unsigned int ct_idx;
- unsigned int cv_idx;
- size_t pos = 0;
-
- /* File header. */
- pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
- pos += store_gcov_u32(buffer, pos, info->version);
- pos += store_gcov_u32(buffer, pos, info->stamp);
-
- for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
- fi_ptr = info->functions[fi_idx];
-
- /* Function record. */
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
- pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
- pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
- pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-
- ci_ptr = fi_ptr->ctrs;
-
- for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
- if (!counter_active(info, ct_idx))
- continue;
-
- /* Counter record. */
- pos += store_gcov_u32(buffer, pos,
- GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
-
- for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
- pos += store_gcov_u64(buffer, pos,
- ci_ptr->values[cv_idx]);
- }
-
- ci_ptr++;
- }
- }
-
- return pos;
-}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
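
Besides the gcc-4.7 record layout, the deleted file carried a useful serialization pattern: store_gcov_u32()/store_gcov_u64() return the size they would emit and only write when the buffer is non-NULL, so convert_to_gcda() doubles as both a sizing dry run and the actual writer (see gcov_iter_new() above). A minimal model of the two-pass trick:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static size_t store_u32(void *buf, size_t off, uint32_t v)
    {
        if (buf)
            memcpy((char *)buf + off, &v, sizeof(v));
        return sizeof(v);             /* size emitted either way */
    }

    static size_t convert(void *buf)
    {
        size_t pos = 0;

        pos += store_u32(buf, pos, 0x67636461u); /* GCOV_DATA_MAGIC */
        pos += store_u32(buf, pos, 0);           /* version, stamp, ... */
        return pos;
    }

    int main(void)
    {
        size_t sz = convert(NULL);    /* pass 1: measure */
        void *buf = malloc(sz);

        if (buf)
            convert(buf);             /* pass 2: fill */
        free(buf);
        return 0;
    }
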
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 92c8e22..060073e 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,6 +21,7 @@
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
+#define GCOV_COUNTERS 5
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -33,18 +34,60 @@ typedef long gcov_type;
typedef long long gcov_type;
#endif
-/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so
- * we cannot use full definition here and they need to be placed in gcc specific
- * implementation of gcov. This also means no direct access to the members in
- * generic code and usage of the interface below.*/
-struct gcov_info;
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
-/* Interface to access gcov_info data */
-const char *gcov_info_filename(struct gcov_info *info);
-unsigned int gcov_info_version(struct gcov_info *info);
-struct gcov_info *gcov_info_next(struct gcov_info *info);
-void gcov_info_link(struct gcov_info *info);
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
/* Base interface. */
enum gcov_action {
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9328b80..3e97fb1 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,12 +16,11 @@
#include <linux/export.h>
#include <linux/sysctl.h>
#include <linux/utsname.h>
-#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
@@ -93,9 +92,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
t->last_switch_count = switch_count;
return;
}
-
- trace_sched_process_hang(t);
-
if (!sysctl_hung_task_warnings)
return;
sysctl_hung_task_warnings--;
@@ -207,14 +203,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
-static atomic_t reset_hung_task = ATOMIC_INIT(0);
-
-void reset_hung_task_detector(void)
-{
- atomic_set(&reset_hung_task, 1);
-}
-EXPORT_SYMBOL_GPL(reset_hung_task_detector);
-
/*
* kthread which checks for tasks stuck in D state
*/
@@ -228,9 +216,6 @@ static int watchdog(void *dummy)
while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
timeout = sysctl_hung_task_timeout_secs;
- if (atomic_xchg(&reset_hung_task, 0))
- continue;
-
check_hung_uninterruptible_tasks(timeout);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc04c16..a3bb14f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc)
}
/**
- * irq_disable - Mark interrupt disabled
+ * irq_disable - Mark interupt disabled
* @desc: irq descriptor which should be disabled
*
* If the chip does not implement the irq_disable callback, we
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb3..706724e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,26 +465,27 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
}
EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+unsigned int irq_create_of_mapping(struct device_node *controller,
+ const u32 *intspec, unsigned int intsize)
{
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
unsigned int virq;
- domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
+ domain = controller ? irq_find_host(controller) : irq_default_domain;
if (!domain) {
pr_warn("no irq domain found for %s !\n",
- of_node_full_name(irq_data->np));
+ of_node_full_name(controller));
return 0;
}
/* If domain has no translation, then we assume interrupt line */
if (domain->ops->xlate == NULL)
- hwirq = irq_data->args[0];
+ hwirq = intspec[0];
else {
- if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
- irq_data->args_count, &hwirq, &type))
+ if (domain->ops->xlate(domain, controller, intspec, intsize,
+ &hwirq, &type))
return 0;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 481a13c..514bcfd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
}
/*
- * Interrupts explicitly requested as threaded interrupts want to be
+ * Interrupts explicitly requested as threaded interrupts want to be
 * preemptible - many of them need to sleep and wait for slow busses to
* complete.
*/
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
goto out_mput;
}
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+ sched_setscheduler(t, SCHED_FIFO, &param);
/*
* We keep the reference to the task struct even if
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca..cb228bf 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
bool is_early = desc->action &&
desc->action->flags & IRQF_EARLY_RESUME;
- if (!is_early && want_early)
+ if (is_early != want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84..1162f10 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,7 +14,6 @@ enum {
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
- _IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -27,7 +26,6 @@ enum {
#define IRQ_NOAUTOEN GOT_YOU_MORON
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
-#define IRQ_IS_POLLED GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -149,8 +147,3 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_NESTED_THREAD;
}
-
-static inline bool irq_settings_is_polled(struct irq_desc *desc)
-{
- return desc->status_use_accessors & _IRQ_IS_POLLED;
-}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc6..7b5f012 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -67,13 +67,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
raw_spin_lock(&desc->lock);
- /*
- * PER_CPU, nested thread interrupts and interrupts explicitely
- * marked polled are excluded from polling.
- */
- if (irq_settings_is_per_cpu(desc) ||
- irq_settings_is_nested_thread(desc) ||
- irq_settings_is_polled(desc))
+ /* PER_CPU and nested thread interrupts are never polled */
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
goto out;
/*
@@ -273,8 +268,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
- if (desc->istate & IRQS_POLL_INPROGRESS ||
- irq_settings_is_polled(desc))
+ if (desc->istate & IRQS_POLL_INPROGRESS)
return;
/* we get here again via the threaded handler */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15..297a924 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,7 +58,6 @@ static void jump_label_update(struct static_key *key, int enable);
void static_key_slow_inc(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
if (atomic_inc_not_zero(&key->enabled))
return;
@@ -104,14 +103,12 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
@@ -119,7 +116,6 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
@@ -216,7 +212,6 @@ void __init jump_label_init(void)
key->next = NULL;
#endif
}
- static_key_initialized = true;
jump_label_unlock();
}
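With the STATIC_KEY_CHECK_USE() sanity checks gone, static_key_slow_inc() still short-circuits through atomic_inc_not_zero(): the increment succeeds only while the count is already non-zero, so the zero-to-one transition is forced onto the locked slow path. A sketch of that primitive as a C11 compare-exchange loop (illustrative, not the kernel's implementation):

#include <stdatomic.h>
#include <stdio.h>

/* Increment *v unless it is zero; return 1 on success, 0 if it was zero. */
static int inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0) {
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return 1;
		/* a failed CAS reloaded old; retry unless it hit zero */
	}
	return 0;
}

int main(void)
{
	atomic_int enabled = 0;

	printf("%d\n", inc_not_zero(&enabled));	/* 0: zero->one needs slow path */
	atomic_store(&enabled, 1);
	printf("%d\n", inc_not_zero(&enabled));	/* 1: fast path taken */
	return 0;
}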
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 490afc0..2a74f30 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image,
* reinitialize them.
*
* - A machine specific part that includes the syscall number
- * and then copies the image to it's final destination. And
+ * and then copies the image to its final destination. And
* jumps into the image at entry.
*
* kexec does not sync, or unmount filesystems so if you need
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfc..a0d367a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2066,7 +2066,7 @@ static int __init init_kprobes(void)
{
int i, err = 0;
unsigned long offset = 0, size = 0;
- char *modname, namebuf[KSYM_NAME_LEN];
+ char *modname, namebuf[128];
const char *symbol_name;
void *addr;
struct kprobe_blackpoint *kb;
@@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
- char *modname, namebuf[KSYM_NAME_LEN];
+ char *modname, namebuf[128];
head = &kprobe_table[i];
preempt_disable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee..760e86d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -33,7 +33,7 @@ struct kthread_create_info
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
- struct completion *done;
+ struct completion done;
struct list_head list;
};
@@ -178,7 +178,6 @@ static int kthread(void *_create)
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
- struct completion *done;
struct kthread self;
int ret;
@@ -188,16 +187,10 @@ static int kthread(void *_create)
init_completion(&self.parked);
current->vfork_done = &self.exited;
- /* If user was SIGKILLed, I release the structure. */
- done = xchg(&create->done, NULL);
- if (!done) {
- kfree(create);
- do_exit(-EINTR);
- }
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
- complete(done);
+ complete(&create->done);
schedule();
ret = -EINTR;
@@ -230,15 +223,8 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
- struct completion *done = xchg(&create->done, NULL);
-
- if (!done) {
- kfree(create);
- return;
- }
create->result = ERR_PTR(pid);
- complete(done);
+ complete(&create->done);
}
}
@@ -269,59 +255,36 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[],
...)
{
- DECLARE_COMPLETION_ONSTACK(done);
- struct task_struct *task;
- struct kthread_create_info *create = kmalloc(sizeof(*create),
- GFP_KERNEL);
-
- if (!create)
- return ERR_PTR(-ENOMEM);
- create->threadfn = threadfn;
- create->data = data;
- create->node = node;
- create->done = &done;
+ struct kthread_create_info create;
+
+ create.threadfn = threadfn;
+ create.data = data;
+ create.node = node;
+ init_completion(&create.done);
spin_lock(&kthread_create_lock);
- list_add_tail(&create->list, &kthread_create_list);
+ list_add_tail(&create.list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
- /*
- * Wait for completion in killable state, for I might be chosen by
- * the OOM killer while kthreadd is trying to allocate memory for
- * new kernel thread.
- */
- if (unlikely(wait_for_completion_killable(&done))) {
- /*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
- */
- if (xchg(&create->done, NULL))
- return ERR_PTR(-ENOMEM);
- /*
- * kthreadd (or new kernel thread) will call complete()
- * shortly.
- */
- wait_for_completion(&done);
- }
- task = create->result;
- if (!IS_ERR(task)) {
+ wait_for_completion(&create.done);
+
+ if (!IS_ERR(create.result)) {
static const struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
- vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+ vsnprintf(create.result->comm, sizeof(create.result->comm),
+ namefmt, args);
va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
+ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(create.result, cpu_all_mask);
}
- kfree(create);
- return task;
+ return create.result;
}
EXPORT_SYMBOL(kthread_create_on_node);
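Moving kthread_create_info back onto the caller's stack is safe only because wait_for_completion() is uninterruptible: the waiter provably outlives the complete() call, so the xchg()-based ownership handoff for the SIGKILL race becomes unnecessary. A hedged pthread sketch of the completion primitive itself (simplified; the kernel version also offers killable and timed waits):

#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)		/* guard against spurious wakeups */
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *worker(void *arg)
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion done;	/* on-stack, like create.done above */
	pthread_t t;

	init_completion(&done);
	pthread_create(&t, NULL, worker, &done);
	wait_for_completion(&done);
	pthread_join(&t, NULL);
	puts("completed");
	return 0;
}

The removed heap-plus-xchg variant existed precisely because a killable wait lets the waiter leave early, at which point someone must still own the create-info when kthreadd finally completes it.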
diff --git a/kernel/locking/lglock.c b/kernel/lglock.c
index 86ae2ae..86ae2ae 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/lglock.c
diff --git a/kernel/locking/lockdep.c b/kernel/lockdep.c
index 576ba75..e16c45b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/lockdep.c
@@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data)
return 0;
}
-static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
struct lock_list *uninitialized_var(target_entry);
@@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
return ret;
}
-static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
+unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
struct lock_list *uninitialized_var(target_entry);
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
+ : rcu_is_cpu_idle()
? "RCU used illegally from idle CPU!\n"
: "",
rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
* So complain bitterly if someone does call rcu_read_lock(),
* rcu_read_lock_bh() and so on from extended quiescent states.
*/
- if (!rcu_is_watching())
+ if (rcu_is_cpu_idle())
printk("RCU used illegally from extended quiescent state!\n");
lockdep_print_held_locks(curr);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/lockdep_internals.h
index 4f560cf..4f560cf 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
diff --git a/kernel/locking/lockdep_proc.c b/kernel/lockdep_proc.c
index ef43ac4..b2c71c5 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -421,7 +421,6 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -519,20 +518,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
}
if (i) {
seq_puts(m, "\n");
- seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
+ seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
seq_puts(m, "\n");
}
}
static void seq_header(struct seq_file *m)
{
- seq_puts(m, "lock_stat version 0.4\n");
+ seq_printf(m, "lock_stat version 0.3\n");
if (unlikely(!debug_locks))
seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
- seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
- seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
+ seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+ seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
"%14s %14s\n",
"class name",
"con-bounces",
@@ -540,14 +539,12 @@ static void seq_header(struct seq_file *m)
"waittime-min",
"waittime-max",
"waittime-total",
- "waittime-avg",
"acq-bounces",
"acquisitions",
"holdtime-min",
"holdtime-max",
- "holdtime-total",
- "holdtime-avg");
- seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ "holdtime-total");
+ seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
seq_printf(m, "\n");
}
diff --git a/kernel/locking/lockdep_states.h b/kernel/lockdep_states.h
index 995b0cc..995b0cc 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/lockdep_states.h
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
deleted file mode 100644
index baab8e5..0000000
--- a/kernel/locking/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
-
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
-endif
-
-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
-obj-$(CONFIG_LOCKDEP) += lockdep.o
-ifeq ($(CONFIG_PROC_FS),y)
-obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
-endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
-obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
deleted file mode 100644
index 652a8ee..0000000
--- a/kernel/locking/percpu-rwsem.c
+++ /dev/null
@@ -1,165 +0,0 @@
-#include <linux/atomic.h>
-#include <linux/rwsem.h>
-#include <linux/percpu.h>
-#include <linux/wait.h>
-#include <linux/lockdep.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/errno.h>
-
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
- const char *name, struct lock_class_key *rwsem_key)
-{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
- return -ENOMEM;
-
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
- return 0;
-}
-
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
-{
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
-}
-
-/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
-{
- bool success = false;
-
- preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
- __this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
- preempt_enable();
-
- return success;
-}
-
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
- might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
- return;
- }
-
- down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
- __up_read(&brw->rw_sem);
-}
-
-void percpu_up_read(struct percpu_rw_semaphore *brw)
-{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, -1)))
- return;
-
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
-}
-
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
- unsigned int sum = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
-
- return sum;
-}
-
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
-void percpu_down_write(struct percpu_rw_semaphore *brw)
-{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
- /*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
- *
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
- *
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
- */
- synchronize_sched_expedited();
-
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
-
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
-
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
-}
-
-void percpu_up_write(struct percpu_rw_semaphore *brw)
-{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
- /*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
- */
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
-}
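The deleted reader fast path turns on a single check: a reader may bump only its local counter while no writer has announced itself via write_ctr, and the writer later flushes those counters after synchronize_sched_expedited(). A stripped-down C11 sketch of just that check, using thread-local storage in place of per-cpu data and omitting the writer-side synchronization entirely, so it is not a usable lock:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int write_ctr;		/* writers increment to block the fast path */
static _Thread_local int fast_read_ctr;	/* stand-in for the per-cpu counter */

static bool update_fast_ctr(int val)
{
	if (atomic_load(&write_ctr) != 0)
		return false;		/* pending writer: caller takes the slow path */
	fast_read_ctr += val;		/* uncontended, purely local update */
	return true;
}

int main(void)
{
	printf("fast path: %d\n", update_fast_ctr(+1));	/* 1 */
	atomic_fetch_add(&write_ctr, 1);
	printf("fast path: %d\n", update_fast_ctr(+1));	/* 0 */
	return 0;
}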
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index 9be8a91..0000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,296 +0,0 @@
-/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
- * generic spinlock implementation
- *
- * Copyright (c) 2001 David Howells (dhowells@redhat.com).
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
- * - Derived also from comments by Linus
- */
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/export.h>
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-int rwsem_is_locked(struct rw_semaphore *sem)
-{
- int ret = 1;
- unsigned long flags;
-
- if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->activity != 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- }
- return ret;
-}
-EXPORT_SYMBOL(rwsem_is_locked);
-
-/*
- * initialise the semaphore
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->activity = 0;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-EXPORT_SYMBOL(__init_rwsem);
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here, then:
- * - the 'active count' _reached_ zero
- * - the 'waiting count' is non-zero
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if wakewrite is non-zero
- */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- int woken;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wakewrite)
- /* Wake up a writer. Note that we do not grant it the
- * lock - it will have to acquire it when it runs. */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* grant an infinite number of read locks to the front of the queue */
- woken = 0;
- do {
- struct list_head *next = waiter->list.next;
-
- list_del(&waiter->list);
- tsk = waiter->task;
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- woken++;
- if (next == &sem->wait_list)
- break;
- waiter = list_entry(next, struct rwsem_waiter, list);
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- sem->activity += woken;
-
- out:
- return sem;
-}
-
-/*
- * wake a single writer
- */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
-{
- struct rwsem_waiter *waiter;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- wake_up_process(waiter->task);
-
- return sem;
-}
-
-/*
- * get a read lock on the semaphore
- */
-void __sched __down_read(struct rw_semaphore *sem)
-{
- struct rwsem_waiter waiter;
- struct task_struct *tsk;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->activity++;
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- goto out;
- }
-
- tsk = current;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
- /* set up my own style of waitqueue */
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- /* wait to be given the lock */
- for (;;) {
- if (!waiter.task)
- break;
- schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- }
-
- tsk->state = TASK_RUNNING;
- out:
- ;
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->activity++;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * get a write lock on the semaphore
- */
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
- struct rwsem_waiter waiter;
- struct task_struct *tsk;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* set up my own style of waitqueue */
- tsk = current;
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait for someone to release the lock */
- for (;;) {
- /*
- * That is the key to support write lock stealing: allows the
- * task already on CPU to get the lock soon rather than put
- * itself into sleep and waiting for system woke it or someone
- * else in the head of the wait list up.
- */
- if (sem->activity == 0)
- break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
- /* got the lock */
- sem->activity = -1;
- list_del(&waiter.list);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __down_write_nested(sem, 0);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->activity == 0) {
- /* got the lock */
- sem->activity = -1;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * release a read lock on the semaphore
- */
-void __up_read(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (--sem->activity == 0 && !list_empty(&sem->wait_list))
- sem = __rwsem_wake_one_writer(sem);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * release a write lock on the semaphore
- */
-void __up_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->activity = 0;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 1);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * downgrade a write lock into a read lock
- * - just wake up any readers at the front of the queue
- */
-void __downgrade_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->activity = 1;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 0);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 19c5fa9..0000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/* rwsem.c: R/W semaphores: contention handling functions
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- *
- * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
- * and Michel Lespinasse <walken@google.com>
- */
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/export.h>
-
-/*
- * Initialize an rwsem:
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-
-EXPORT_SYMBOL(__init_rwsem);
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-enum rwsem_wake_type {
- RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
- RWSEM_WAKE_READERS, /* Wake readers only */
- RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
-};
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then:
- * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
- * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- * - there must be someone on the queue
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
- */
-static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- struct list_head *next;
- long oldcount, woken, loop, adjustment;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
- */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* Writers might steal the lock before we grant it to the next reader.
- * We prefer to do the first reader grant before counting readers
- * so we can bail out early if a writer stole the lock.
- */
- adjustment = 0;
- if (wake_type != RWSEM_WAKE_READ_OWNED) {
- adjustment = RWSEM_ACTIVE_READ_BIAS;
- try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
- if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
- goto out;
- /* Last active locker left. Retry waking readers. */
- goto try_reader_grant;
- }
- }
-
- /* Grant an infinite number of read locks to the readers at the front
- * of the queue. Note we increment the 'active part' of the count by
- * the number of readers before waking any processes up.
- */
- woken = 0;
- do {
- woken++;
-
- if (waiter->list.next == &sem->wait_list)
- break;
-
- waiter = list_entry(waiter->list.next,
- struct rwsem_waiter, list);
-
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (waiter->type != RWSEM_WAITING_FOR_WRITE)
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
-
- if (adjustment)
- rwsem_atomic_add(adjustment, sem);
-
- next = sem->wait_list.next;
- loop = woken;
- do {
- waiter = list_entry(next, struct rwsem_waiter, list);
- next = waiter->list.next;
- tsk = waiter->task;
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- } while (--loop);
-
- sem->wait_list.next = next;
- next->prev = &sem->wait_list;
-
- out:
- return sem;
-}
-
-/*
- * wait for the read lock to be granted
- */
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
-
- /* set up my own style of waitqueue */
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
-
- /* If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
-
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* wait to be given the lock */
- while (true) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if (!waiter.task)
- break;
- schedule();
- }
-
- tsk->state = TASK_RUNNING;
-
- return sem;
-}
-
-/*
- * wait until we successfully acquire the write lock
- */
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
-{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
- struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
-
- /* set up my own style of waitqueue */
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
-
- /* If there were already threads queued before us and there are no
- * active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
-
- /* wait until we successfully acquire the lock */
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* Block until there are no active lockers. */
- do {
- schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
-
- raw_spin_lock_irq(&sem->wait_lock);
- }
-
- list_del(&waiter.list);
- raw_spin_unlock_irq(&sem->wait_lock);
- tsk->state = TASK_RUNNING;
-
- return sem;
-}
-
-/*
- * handle waking up a waiter on the semaphore
- * - up_read/up_write has decremented the active part of count if we come here
- */
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* do nothing if list empty */
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return sem;
-}
-
-/*
- * downgrade a write lock into a read lock
- * - caller incremented waiting part of count and discovered it still negative
- * - just wake up any readers at the front of the queue
- */
-struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* do nothing if list empty */
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return sem;
-}
-
-EXPORT_SYMBOL(rwsem_down_read_failed);
-EXPORT_SYMBOL(rwsem_down_write_failed);
-EXPORT_SYMBOL(rwsem_wake);
-EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
deleted file mode 100644
index 0374a59..0000000
--- a/kernel/locking/spinlock_debug.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Copyright 2005, Red Hat, Inc., Ingo Molnar
- * Released under the General Public License (GPL).
- *
- * This file contains the spinlock/rwlock implementations for
- * DEBUG_SPINLOCK.
- */
-
-#include <linux/spinlock.h>
-#include <linux/nmi.h>
-#include <linux/interrupt.h>
-#include <linux/debug_locks.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-
-void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
- lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
- lock->magic = SPINLOCK_MAGIC;
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-EXPORT_SYMBOL(__raw_spin_lock_init);
-
-void __rwlock_init(rwlock_t *lock, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
- lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
- lock->magic = RWLOCK_MAGIC;
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-EXPORT_SYMBOL(__rwlock_init);
-
-static void spin_dump(raw_spinlock_t *lock, const char *msg)
-{
- struct task_struct *owner = NULL;
-
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
- printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
- msg, raw_smp_processor_id(),
- current->comm, task_pid_nr(current));
- printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
- ".owner_cpu: %d\n",
- lock, lock->magic,
- owner ? owner->comm : "<none>",
- owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
- dump_stack();
-}
-
-static void spin_bug(raw_spinlock_t *lock, const char *msg)
-{
- if (!debug_locks_off())
- return;
-
- spin_dump(lock, msg);
-}
-
-#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
-
-static inline void
-debug_spin_lock_before(raw_spinlock_t *lock)
-{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
- lock, "cpu recursion");
-}
-
-static inline void debug_spin_lock_after(raw_spinlock_t *lock)
-{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
-}
-
-static inline void debug_spin_unlock(raw_spinlock_t *lock)
-{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
- SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
- SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
- lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-static void __spin_lock_debug(raw_spinlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
-
- for (i = 0; i < loops; i++) {
- if (arch_spin_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- spin_dump(lock, "lockup suspected");
-#ifdef CONFIG_SMP
- trigger_all_cpu_backtrace();
-#endif
-
- /*
- * The trylock above was causing a livelock. Give the lower level arch
- * specific lock code a chance to acquire the lock. We have already
- * printed a warning/backtrace at this point. The non-debug arch
- * specific code might actually succeed in acquiring the lock. If it is
- * not successful, the end-result is the same - there is no forward
- * progress.
- */
- arch_spin_lock(&lock->raw_lock);
-}
-
-void do_raw_spin_lock(raw_spinlock_t *lock)
-{
- debug_spin_lock_before(lock);
- if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
- __spin_lock_debug(lock);
- debug_spin_lock_after(lock);
-}
-
-int do_raw_spin_trylock(raw_spinlock_t *lock)
-{
- int ret = arch_spin_trylock(&lock->raw_lock);
-
- if (ret)
- debug_spin_lock_after(lock);
-#ifndef CONFIG_SMP
- /*
- * Must not happen on UP:
- */
- SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
-#endif
- return ret;
-}
-
-void do_raw_spin_unlock(raw_spinlock_t *lock)
-{
- debug_spin_unlock(lock);
- arch_spin_unlock(&lock->raw_lock);
-}
-
-static void rwlock_bug(rwlock_t *lock, const char *msg)
-{
- if (!debug_locks_off())
- return;
-
- printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
- msg, raw_smp_processor_id(), current->comm,
- task_pid_nr(current), lock);
- dump_stack();
-}
-
-#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
-
-#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
-static void __read_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_read_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
-void do_raw_read_lock(rwlock_t *lock)
-{
- RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- arch_read_lock(&lock->raw_lock);
-}
-
-int do_raw_read_trylock(rwlock_t *lock)
-{
- int ret = arch_read_trylock(&lock->raw_lock);
-
-#ifndef CONFIG_SMP
- /*
- * Must not happen on UP:
- */
- RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
-#endif
- return ret;
-}
-
-void do_raw_read_unlock(rwlock_t *lock)
-{
- RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- arch_read_unlock(&lock->raw_lock);
-}
-
-static inline void debug_write_lock_before(rwlock_t *lock)
-{
- RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
- lock, "cpu recursion");
-}
-
-static inline void debug_write_lock_after(rwlock_t *lock)
-{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
-}
-
-static inline void debug_write_unlock(rwlock_t *lock)
-{
- RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
- RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
- lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-#if 0 /* This can cause lockups */
-static void __write_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_write_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
-void do_raw_write_lock(rwlock_t *lock)
-{
- debug_write_lock_before(lock);
- arch_write_lock(&lock->raw_lock);
- debug_write_lock_after(lock);
-}
-
-int do_raw_write_trylock(rwlock_t *lock)
-{
- int ret = arch_write_trylock(&lock->raw_lock);
-
- if (ret)
- debug_write_lock_after(lock);
-#ifndef CONFIG_SMP
- /*
- * Must not happen on UP:
- */
- RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
-#endif
- return ret;
-}
-
-void do_raw_write_unlock(rwlock_t *lock)
-{
- debug_write_unlock(lock);
- arch_write_unlock(&lock->raw_lock);
-}
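The deleted __spin_lock_debug() bounds its trylock loop so a suspected lockup is reported once before falling back to the plain blocking acquire. The same shape with a C11 atomic_flag; the loop bound and message stand in for loops_per_jiffy * HZ and spin_dump(), and are illustrative only:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag lock = ATOMIC_FLAG_INIT;

static void debug_spin_lock(void)
{
	for (long i = 0; i < 1000000; i++)	/* bounded trylock attempts */
		if (!atomic_flag_test_and_set(&lock))
			return;

	fprintf(stderr, "lockup suspected\n");	/* diagnose once, then block */
	while (atomic_flag_test_and_set(&lock))
		;				/* plain spinning acquire */
}

int main(void)
{
	debug_spin_lock();
	atomic_flag_clear(&lock);
	puts("acquired and released");
	return 0;
}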
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 0000000..4a9a86d
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,12 @@
+#include <linux/export.h>
+
+#define GLOBAL(name) \
+ .globl VMLINUX_SYMBOL(name); \
+ VMLINUX_SYMBOL(name):
+
+ .section ".init.data","aw"
+
+GLOBAL(modsign_certificate_list)
+ .incbin "signing_key.x509"
+ .incbin "extra_certificates"
+GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 0000000..7cbd450
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,104 @@
+/* Public keys for module signature verification
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include "module-internal.h"
+
+struct key *modsign_keyring;
+
+extern __initconst const u8 modsign_certificate_list[];
+extern __initconst const u8 modsign_certificate_list_end[];
+
+/*
+ * We need to make sure ccache doesn't cache the .o file as it doesn't notice
+ * if modsign.pub changes.
+ */
+static __initconst const char annoy_ccache[] = __TIME__ "foo";
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int module_verify_init(void)
+{
+ pr_notice("Initialise module verification\n");
+
+ modsign_keyring = keyring_alloc(".module_sign",
+ KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(),
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(modsign_keyring))
+ panic("Can't allocate module signing keyring\n");
+
+ return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(module_verify_init);
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int load_module_signing_keys(void)
+{
+ key_ref_t key;
+ const u8 *p, *end;
+ size_t plen;
+
+ pr_notice("Loading module verification certificates\n");
+
+ end = modsign_certificate_list_end;
+ p = modsign_certificate_list;
+ while (p < end) {
+ /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+ * than 256 bytes in size.
+ */
+ if (end - p < 4)
+ goto dodgy_cert;
+ if (p[0] != 0x30 ||
+ p[1] != 0x82)
+ goto dodgy_cert;
+ plen = (p[2] << 8) | p[3];
+ plen += 4;
+ if (plen > end - p)
+ goto dodgy_cert;
+
+ key = key_create_or_update(make_key_ref(modsign_keyring, 1),
+ "asymmetric",
+ NULL,
+ p,
+ plen,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(key))
+ pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
+ PTR_ERR(key));
+ else
+ pr_notice("MODSIGN: Loaded cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+ p += plen;
+ }
+
+ return 0;
+
+dodgy_cert:
+ pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
+ return 0;
+}
+late_initcall(load_module_signing_keys);
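Each blob in the compiled-in list is delimited by its own DER header: tag 0x30 (SEQUENCE) followed by length form 0x82, meaning a two-byte big-endian body length, so a record spans that length plus the four header bytes. A standalone sketch of the same walk over fabricated headers (not real certificates):

#include <stddef.h>
#include <stdio.h>

static void walk_cert_list(const unsigned char *p, const unsigned char *end)
{
	while (p < end) {
		size_t plen;

		if (end - p < 4 || p[0] != 0x30 || p[1] != 0x82)
			break;			/* malformed header */
		plen = ((size_t)p[2] << 8 | p[3]) + 4;	/* body + 4 header bytes */
		if (plen > (size_t)(end - p))
			break;			/* truncated record */
		printf("cert of %zu bytes\n", plen);
		p += plen;
	}
}

int main(void)
{
	/* two fabricated headers: 0x0102-byte and 0x0008-byte bodies */
	unsigned char buf[4 + 0x0102 + 4 + 0x0008] = {
		[0] = 0x30, [1] = 0x82, [2] = 0x01, [3] = 0x02,
		[4 + 0x0102 + 0] = 0x30, [4 + 0x0102 + 1] = 0x82,
		[4 + 0x0102 + 2] = 0x00, [4 + 0x0102 + 3] = 0x08,
	};

	walk_cert_list(buf, buf + sizeof(buf));
	return 0;
}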
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 915e123..24f9247 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -9,4 +9,6 @@
* 2 of the Licence, or (at your option) any later version.
*/
+extern struct key *modsign_keyring;
+
extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index f5a3b1e..dc58274 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -378,21 +378,23 @@ static bool check_symbol(const struct symsearch *syms,
if (syms->licence == GPL_ONLY)
return false;
if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
- pr_warn("Symbol %s is being used by a non-GPL module, "
- "which will not be allowed in the future\n",
- fsa->name);
+ printk(KERN_WARNING "Symbol %s is being used "
+ "by a non-GPL module, which will not "
+ "be allowed in the future\n", fsa->name);
}
}
#ifdef CONFIG_UNUSED_SYMBOLS
if (syms->unused && fsa->warn) {
- pr_warn("Symbol %s is marked as UNUSED, however this module is "
- "using it.\n", fsa->name);
- pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evalute if this is the right api to use and if "
- "it really is, submit a report the linux kernel "
- "mailinglist together with submitting your code for "
- "inclusion.\n");
+ printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+ "however this module is using it.\n", fsa->name);
+ printk(KERN_WARNING
+ "This symbol will go away in the future.\n");
+ printk(KERN_WARNING
+ "Please evalute if this is the right api to use and if "
+ "it really is, submit a report the linux kernel "
+ "mailinglist together with submitting your code for "
+ "inclusion.\n");
}
#endif
@@ -490,15 +492,16 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)
return 0;
if (align > PAGE_SIZE) {
- pr_warn("%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
align = PAGE_SIZE;
}
mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
if (!mod->percpu) {
- pr_warn("%s: Could not allocate %lu bytes percpu data\n",
- mod->name, (unsigned long)pcpusec->sh_size);
+ printk(KERN_WARNING
+ "%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
return -ENOMEM;
}
mod->percpu_size = pcpusec->sh_size;
@@ -641,6 +644,8 @@ static int module_unload_init(struct module *mod)
/* Hold reference count during initialization. */
__this_cpu_write(mod->refptr->incs, 1);
+ /* Backwards compatibility macros put refcount during init. */
+ mod->waiter = current;
return 0;
}
@@ -674,7 +679,7 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
- pr_warn("%s: out of memory loading\n", a->name);
+ printk(KERN_WARNING "%s: out of memory loading\n", a->name);
return -ENOMEM;
}
@@ -766,9 +771,16 @@ static int __try_stop_module(void *_sref)
static int try_stop_module(struct module *mod, int flags, int *forced)
{
- struct stopref sref = { mod, flags, forced };
+ if (flags & O_NONBLOCK) {
+ struct stopref sref = { mod, flags, forced };
- return stop_machine(__try_stop_module, &sref, NULL);
+ return stop_machine(__try_stop_module, &sref, NULL);
+ } else {
+ /* We don't need to stop the machine for this. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ return 0;
+ }
}
unsigned long module_refcount(struct module *mod)
@@ -801,6 +813,21 @@ EXPORT_SYMBOL(module_refcount);
/* This exists whether we can unload or not */
static void free_module(struct module *mod);
+static void wait_for_zero_refcount(struct module *mod)
+{
+ /* Since we might sleep for some time, release the mutex first */
+ mutex_unlock(&module_mutex);
+ for (;;) {
+ pr_debug("Looking at refcount...\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (module_refcount(mod) == 0)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ mutex_lock(&module_mutex);
+}
+
SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
unsigned int, flags)
{
@@ -815,11 +842,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- if (!(flags & O_NONBLOCK)) {
- printk(KERN_WARNING
- "waiting module removal not supported: please upgrade");
- }
-
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
@@ -837,7 +859,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
/* Doing init or already dying? */
if (mod->state != MODULE_STATE_LIVE) {
- /* FIXME: if (force), slam module count damn the torpedoes */
+ /* FIXME: if (force), slam module count and wake up
+ waiter --RR */
pr_debug("%s already dying\n", mod->name);
ret = -EBUSY;
goto out;
@@ -853,11 +876,18 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
}
}
+ /* Set this up before setting mod->state */
+ mod->waiter = current;
+
/* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
if (ret != 0)
goto out;
+ /* Never wait if forced. */
+ if (!forced && module_refcount(mod) != 0)
+ wait_for_zero_refcount(mod);
+
mutex_unlock(&module_mutex);
/* Final destruction now no one is using it. */
if (mod->exit != NULL)
@@ -975,6 +1005,9 @@ void module_put(struct module *module)
__this_cpu_inc(module->refptr->decs);
trace_module_put(module, _RET_IP_);
+ /* Maybe they're waiting for us to drop reference? */
+ if (unlikely(!module_is_live(module)))
+ wake_up_process(module->waiter);
preempt_enable();
}
}
@@ -1112,7 +1145,8 @@ static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
- pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ printk(KERN_WARNING "%s: %s: kernel tainted.\n",
+ mod->name, reason);
add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
@@ -1165,7 +1199,8 @@ static int check_version(Elf_Shdr *sechdrs,
goto bad_version;
}
- pr_warn("%s: no symbol version for %s\n", mod->name, symname);
+ printk(KERN_WARNING "%s: no symbol version for %s\n",
+ mod->name, symname);
return 0;
bad_version:
@@ -1274,8 +1309,8 @@ resolve_symbol_wait(struct module *mod,
!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
|| PTR_ERR(ksym) != -EBUSY,
30 * HZ) <= 0) {
- pr_warn("%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
+ printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
}
return ksym;
}
@@ -1591,14 +1626,15 @@ static int mod_sysfs_init(struct module *mod)
struct kobject *kobj;
if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
+ printk(KERN_ERR "%s: module sysfs not initialized\n",
+ mod->name);
err = -EINVAL;
goto out;
}
kobj = kset_find_obj(module_kset, mod->name);
if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
+ printk(KERN_ERR "%s: module is already loaded\n", mod->name);
kobject_put(kobj);
err = -EINVAL;
goto out;
@@ -1925,7 +1961,8 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(s->name, &owner, NULL, true, false)) {
- pr_err("%s: exports duplicate symbol %s"
+ printk(KERN_ERR
+ "%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, s->name, module_name(owner));
return -ENOEXEC;
@@ -1976,8 +2013,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- pr_warn("%s: Unknown symbol %s (err %li)\n",
- mod->name, name, PTR_ERR(ksym));
+ printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
+ mod->name, name, PTR_ERR(ksym));
ret = PTR_ERR(ksym) ?: -ENOENT;
break;
@@ -2131,8 +2168,8 @@ static void set_license(struct module *mod, const char *license)
if (!license_is_gpl_compatible(license)) {
if (!test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license '%s' taints kernel.\n",
- mod->name, license);
+ printk(KERN_WARNING "%s: module license '%s' taints "
+ "kernel.\n", mod->name, license);
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
}
@@ -2368,8 +2405,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
+ printk(KERN_ERR "dynamic debug error adding module: %s\n",
+ debug->modname);
#endif
}
@@ -2582,7 +2619,8 @@ static int rewrite_section_headers(struct load_info *info, int flags)
Elf_Shdr *shdr = &info->sechdrs[i];
if (shdr->sh_type != SHT_NOBITS
&& info->len < shdr->sh_offset + shdr->sh_size) {
- pr_err("Module len %lu truncated\n", info->len);
+ printk(KERN_ERR "Module len %lu truncated\n",
+ info->len);
return -ENOEXEC;
}
@@ -2644,14 +2682,15 @@ static struct module *setup_load_info(struct load_info *info, int flags)
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
- pr_warn("No module found in object\n");
+ printk(KERN_WARNING "No module found in object\n");
return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
+ printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
+ mod->name);
return ERR_PTR(-ENOEXEC);
}
@@ -2678,7 +2717,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
if (err)
return err;
} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- pr_err("%s: version magic '%s' should be '%s'\n",
+ printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
return -ENOEXEC;
}
@@ -2688,8 +2727,9 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
+ printk(KERN_WARNING "%s: module is from the staging directory,"
+ " the quality is unknown, you have been warned.\n",
+ mod->name);
}
/* Set up license info based on the info section */
@@ -2698,7 +2738,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return 0;
}
-static int find_module_sections(struct module *mod, struct load_info *info)
+static void find_module_sections(struct module *mod, struct load_info *info)
{
mod->kp = section_objs(info, "__param",
sizeof(*mod->kp), &mod->num_kp);
@@ -2728,18 +2768,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
#ifdef CONFIG_CONSTRUCTORS
mod->ctors = section_objs(info, ".ctors",
sizeof(*mod->ctors), &mod->num_ctors);
- if (!mod->ctors)
- mod->ctors = section_objs(info, ".init_array",
- sizeof(*mod->ctors), &mod->num_ctors);
- else if (find_sec(info, ".init_array")) {
- /*
- * This shouldn't happen with same compiler and binutils
- * building all parts of the module.
- */
- printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
- mod->name);
- return -EINVAL;
- }
#endif
#ifdef CONFIG_TRACEPOINTS
@@ -2773,12 +2801,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->extable), &mod->num_exentries);
if (section_addr(info, "__obsparm"))
- pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ mod->name);
info->debug = section_objs(info, "__verbose",
sizeof(*info->debug), &info->num_debug);
-
- return 0;
}
static int move_module(struct module *mod, struct load_info *info)
@@ -3051,10 +3078,11 @@ static int do_init_module(struct module *mod)
return ret;
}
if (ret > 0) {
- pr_warn("%s: '%s'->init suspiciously returned %d, it should "
- "follow 0/-E convention\n"
- "%s: loading module anyway...\n",
- __func__, mod->name, ret, __func__);
+ printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
dump_stack();
}
@@ -3177,8 +3205,10 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
/* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
- if (ret != 0)
- pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ if (ret != 0) {
+ printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
+ modname, param);
+ }
return 0;
}
@@ -3213,9 +3243,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
#ifdef CONFIG_MODULE_SIG
mod->sig_ok = info->sig_ok;
if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
+ printk_once(KERN_NOTICE
+ "%s: module verification failed: signature and/or"
+ " required key missing - tainting kernel\n",
+ mod->name);
add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
}
#endif
@@ -3232,9 +3263,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Now we've got everything in the final locations, we can
* find optional sections. */
- err = find_module_sections(mod, info);
- if (err)
- goto free_unload;
+ find_module_sections(mod, info);
err = check_module_license_and_versions(mod);
if (err)
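Throughout the module.c hunks above, the revert replaces the pr_warn()/pr_err()/pr_notice_once() helpers with raw printk() calls carrying explicit KERN_* prefixes. The output is identical either way: the pr_* macros are thin wrappers over printk(), roughly as sketched below (simplified from include/linux/printk.h; pr_fmt() is a per-file prefix hook that defaults to a no-op):

    #define pr_fmt(fmt) fmt    /* default: no per-file prefix */

    #define pr_warn(fmt, ...) \
            printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_err(fmt, ...) \
            printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)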
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fa..f2970bd 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -14,7 +14,6 @@
#include <crypto/public_key.h>
#include <crypto/hash.h>
#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
#include "module-internal.h"
/*
@@ -29,7 +28,7 @@
*/
struct module_signature {
u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
+ u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
u8 id_type; /* Key identifier type [enum pkey_id_type] */
u8 signer_len; /* Length of signer's name */
u8 key_id_len; /* Length of key identifier */
@@ -40,7 +39,7 @@ struct module_signature {
/*
* Digest the module contents.
*/
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
+static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
const void *mod,
unsigned long modlen)
{
@@ -55,7 +54,7 @@ static struct public_key_signature *mod_make_digest(enum hash_algo hash,
/* Allocate the hashing algorithm we're going to need and find out how
* big the hash operational data will be.
*/
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+ tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
if (IS_ERR(tfm))
return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
@@ -158,7 +157,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
pr_debug("Look up: \"%s\"\n", id);
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
+ key = keyring_search(make_key_ref(modsign_keyring, 1),
&key_type_asymmetric, id);
if (IS_ERR(key))
pr_warn("Request for unknown module key '%s' err %ld\n",
@@ -218,7 +217,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
return -ENOPKG;
if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
+ !pkey_hash_algo[ms.hash])
return -ENOPKG;
key = request_asymmetric_key(sig, ms.signer_len,
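For orientation while reading mod_verify_sig(): a signed module ends with the signature material laid out after the module image, with the fixed-size descriptor above sitting just before the magic marker. A sketch of the on-disk layout (as implied by kernel/module_signing.c of this era):

    /*
     * [ module image            ]  modlen bytes
     * [ signer's name           ]  ms.signer_len bytes
     * [ key identifier          ]  ms.key_id_len bytes
     * [ signature data          ]  ms.sig_len bytes
     * [ struct module_signature ]  the descriptor parsed above
     * [ MODULE_SIG_STRING       ]  "~Module signature appended~\n"
     */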
diff --git a/kernel/locking/mutex-debug.c b/kernel/mutex-debug.c
index 7e3443f..7e3443f 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/mutex-debug.c
diff --git a/kernel/locking/mutex-debug.h b/kernel/mutex-debug.h
index 0799fd3..0799fd3 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/mutex-debug.h
diff --git a/kernel/locking/mutex.c b/kernel/mutex.c
index 4dd6e4c..d24105b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/mutex.c
@@ -1,5 +1,5 @@
/*
- * kernel/locking/mutex.c
+ * kernel/mutex.c
*
* Mutexes: blocking mutual exclusion locks
*
diff --git a/kernel/locking/mutex.h b/kernel/mutex.h
index 4115fbf..4115fbf 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/mutex.h
diff --git a/kernel/padata.c b/kernel/padata.c
index 2abd25d..07af2c9 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,7 +46,6 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
static int padata_cpu_hash(struct parallel_data *pd)
{
- unsigned int seq_nr;
int cpu_index;
/*
@@ -54,8 +53,10 @@ static int padata_cpu_hash(struct parallel_data *pd)
* seq_nr mod. number of cpus in use.
*/
- seq_nr = atomic_inc_return(&pd->seq_nr);
- cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
+ spin_lock(&pd->seq_lock);
+ cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
+ pd->seq_nr++;
+ spin_unlock(&pd->seq_lock);
return padata_index_to_cpu(pd, cpu_index);
}
@@ -428,7 +429,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
- atomic_set(&pd->seq_nr, -1);
+ pd->seq_nr = 0;
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
pd->pinst = pinst;
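The padata hunk swaps the lock-free atomic_inc_return() round-robin for v3.12's spinlock-protected counter; both walk the parallel CPUs in order. A stand-alone sketch of the same indexing, with hypothetical names standing in for pd->seq_nr and the cpumask weight:

    #include <stdio.h>

    static unsigned int seq_nr;     /* stands in for pd->seq_nr */

    static int cpu_hash(unsigned int nr_cpus)
    {
            int cpu_index = seq_nr % nr_cpus;  /* 0, 1, ..., nr_cpus-1, 0, ... */

            seq_nr++;  /* in-kernel this increment sits under pd->seq_lock */
            return cpu_index;
    }

    int main(void)
    {
            for (int i = 0; i < 8; i++)
                    printf("job %d -> cpu %d\n", i, cpu_hash(3));
            return 0;
    }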
diff --git a/kernel/panic.c b/kernel/panic.c
index c00b4ce..b6c482c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -233,7 +233,7 @@ static const struct tnt tnts[] = {
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
+ static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
if (tainted_mask) {
char *s;
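A note on the buffer arithmetic restored in print_tainted(): sizeof on a string literal counts the terminating NUL, so sizeof("Tainted: ") is 10, and ARRAY_SIZE(tnts) adds one flag character per taint bit; the trailing + 1 is v3.12's extra slack byte rather than a strict requirement. A quick stand-alone check of the sizeof behavior:

    #include <stdio.h>

    int main(void)
    {
            /* 9 visible characters plus the implicit '\0' */
            printf("%zu\n", sizeof("Tainted: "));   /* prints 10 */
            return 0;
    }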
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 06c62de..4208655 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -132,12 +132,6 @@ out:
return ERR_PTR(err);
}
-static void delayed_free_pidns(struct rcu_head *p)
-{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
-}
-
static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
@@ -146,7 +140,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
- call_rcu(&ns->rcu, delayed_free_pidns);
+ kmem_cache_free(pid_ns_cachep, ns);
}
struct pid_namespace *copy_pid_ns(unsigned long flags,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc..d444c4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -178,22 +178,6 @@ config PM_SLEEP_DEBUG
def_bool y
depends on PM_DEBUG && PM_SLEEP
-config DPM_WATCHDOG
- bool "Device suspend/resume watchdog"
- depends on PM_DEBUG && PSTORE
- ---help---
- Sets up a watchdog timer to capture drivers that are
- locked up attempting to suspend/resume a device.
- A detected lockup causes system panic with message
- captured in pstore device for inspection in subsequent
- boot session.
-
-config DPM_WATCHDOG_TIMEOUT
- int "Watchdog timeout in seconds"
- range 1 120
- default 12
- depends on DPM_WATCHDOG
-
config PM_TRACE
bool
help
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 8dff9b4..a394297 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -558,12 +558,30 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else {
+ } else if (count <= 11) { /* ASCII perhaps? */
+ char ascii_value[11];
+ unsigned long int ulval;
int ret;
- ret = kstrtos32_from_user(buf, count, 16, &value);
- if (ret)
- return ret;
+ if (copy_from_user(ascii_value, buf, count))
+ return -EFAULT;
+
+ if (count > 10) {
+ if (ascii_value[10] == '\n')
+ ascii_value[10] = '\0';
+ else
+ return -EINVAL;
+ } else {
+ ascii_value[count] = '\0';
+ }
+ ret = kstrtoul(ascii_value, 16, &ulval);
+ if (ret) {
+ pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
+ return -EINVAL;
+ }
+ value = (s32)lower_32_bits(ulval);
+ } else {
+ return -EINVAL;
}
req = filp->private_data;
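The restored pm_qos_power_write() accepts the new constraint either as a raw s32 (count must be exactly 4) or as an ASCII string of at most 10 characters plus an optional newline, parsed as base-16. A hedged user-space sketch of both write styles; the device path is one of the pm_qos misc nodes and is illustrative:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t v = 20;
            int fd = open("/dev/cpu_dma_latency", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, &v, sizeof(v));   /* binary path: count == sizeof(s32) */
            write(fd, "14\n", 3);       /* ASCII path: hex, 0x14 == 20 */
            pause();    /* the request stays in force while fd is open */
            return 0;
    }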
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e..98c3b34 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -792,8 +792,7 @@ void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
- return;
+ BUG_ON(!(forbidden_pages_map && free_pages_map));
bm1 = forbidden_pages_map;
bm2 = free_pages_map;
@@ -1403,11 +1402,7 @@ int hibernate_preallocate_memory(void)
* highmem and non-highmem zones separately.
*/
pages_highmem = preallocate_image_highmem(highmem / 2);
- alloc = count - max_size;
- if (alloc > pages_highmem)
- alloc -= pages_highmem;
- else
- alloc = 0;
+ alloc = (count - max_size) - pages_highmem;
pages = preallocate_image_memory(alloc, avail_normal);
if (pages < alloc) {
/* We have exhausted non-highmem pages, try highmem. */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d3575..957f061 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -36,9 +36,9 @@ static struct snapshot_data {
struct snapshot_handle handle;
int swap;
int mode;
- bool frozen;
- bool ready;
- bool platform_support;
+ char frozen;
+ char ready;
+ char platform_support;
bool free_bitmaps;
} snapshot_state;
@@ -70,7 +70,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = swsusp_resume_device ?
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
- data->free_bitmaps = false;
error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
pm_notifier_call_chain(PM_POST_HIBERNATION);
@@ -94,9 +93,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
atomic_inc(&snapshot_device_available);
- data->frozen = false;
- data->ready = false;
- data->platform_support = false;
+ data->frozen = 0;
+ data->ready = 0;
+ data->platform_support = 0;
Unlock:
unlock_system_sleep();
@@ -230,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (error)
thaw_processes();
else
- data->frozen = true;
+ data->frozen = 1;
break;
@@ -241,7 +240,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
free_basic_memory_bitmaps();
data->free_bitmaps = false;
thaw_processes();
- data->frozen = false;
+ data->frozen = 0;
break;
case SNAPSHOT_CREATE_IMAGE:
@@ -271,7 +270,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
case SNAPSHOT_FREE:
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
- data->ready = false;
+ data->ready = 0;
/*
* It is necessary to thaw kernel threads here, because
* SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -335,7 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
* PM_HIBERNATION_PREPARE
*/
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- data->ready = false;
+ data->ready = 0;
break;
case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86b..b4e8500 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {
#ifdef CONFIG_KEXEC
/*
- * This appends the listed symbols to /proc/vmcore
+ * This appends the listed symbols to /proc/vmcoreinfo
*
- * /proc/vmcore is used by various utilities, like crash and makedumpfile to
+ * /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile to
* obtain access to symbols that are otherwise very difficult to locate. These
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
ignore_loglevel = 1;
- pr_info("debug: ignoring loglevel setting.\n");
+ printk(KERN_INFO "debug: ignoring loglevel setting.\n");
return 0;
}
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)
pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, loops_per_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
- return 0;
+ return 1;
}
-early_param("boot_delay", boot_delay_setup);
+__setup("boot_delay=", boot_delay_setup);
static void boot_delay_msec(int level)
{
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
{
keep_bootcon = 1;
- pr_info("debug: skip boot console de-registration.\n");
+ printk(KERN_INFO "debug: skip boot console de-registration.\n");
return 0;
}
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)
/* find the last or real console */
for_each_console(bcon) {
if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
+ printk(KERN_INFO "Too late to register bootconsole %s%d\n",
newcon->name, newcon->index);
return;
}
@@ -2358,18 +2358,21 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
if (bcon &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
+ /* We need to iterate through twice to make sure we print
+ * everything out before we unregister the console(s).
*/
+ printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
+ newcon->name, newcon->index);
for_each_console(bcon)
if (bcon->flags & CON_BOOT)
unregister_console(bcon);
+ } else {
+ printk(KERN_INFO "%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
}
}
EXPORT_SYMBOL(register_console);
@@ -2379,10 +2382,6 @@ int unregister_console(struct console *console)
struct console *a, *b;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
-
res = _braille_unregister_console(console);
if (res)
return res;
@@ -2422,6 +2421,8 @@ static int __init printk_late_init(void)
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
+ printk(KERN_INFO "turn off boot console %s%d\n",
+ con->name, con->index);
unregister_console(con);
}
}
@@ -2448,7 +2449,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
if (pending & PRINTK_PENDING_SCHED) {
char *buf = __get_cpu_var(printk_sched_buf);
- pr_warn("[sched_delayed] %s", buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
}
if (pending & PRINTK_PENDING_WAKEUP)
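The boot_delay hunk flips both the registration macro and the handler's return value because the two mechanisms use opposite conventions: an early_param() handler returns 0 on success, while a __setup() handler returns 1 to mark the option as consumed (returning 0 would forward it to init). A stripped-down sketch of the restored pattern, with hypothetical names:

    static int example_value;   /* hypothetical */

    static int __init example_setup(char *str)
    {
            get_option(&str, &example_value);   /* parse "example=N" */
            return 1;   /* handled; __setup() convention */
    }
    __setup("example=", example_setup);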
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3..dd562e9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,8 +257,7 @@ ok:
if (task->mm)
dumpable = get_dumpable(task->mm);
rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
rcu_read_unlock();
return -EPERM;
}
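The ptrace hunk is security-relevant. With v3.12's `!dumpable` test, only a fully non-dumpable target (SUID_DUMP_DISABLE) demands the ptrace capability; the reverted-away v3.13 check `dumpable != SUID_DUMP_USER` also caught SUID_DUMP_ROOT. The three dumpability states, as defined in the kernel headers of this era:

    #define SUID_DUMP_DISABLE  0    /* No setuid dumping */
    #define SUID_DUMP_USER     1    /* Dump as user of process */
    #define SUID_DUMP_ROOT     2    /* Dump as root */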
diff --git a/kernel/rcu/rcu.h b/kernel/rcu.h
index 7859a0a..7713196 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu.h
@@ -122,11 +122,4 @@ int rcu_jiffies_till_stall_check(void);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-/*
- * Strings used in tracepoints need to be exported via the
- * tracing system such that tools like perf and trace-cmd can
- * translate the string address pointers to actual text.
- */
-#define TPS(x) tracepoint_string(x)
-
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
deleted file mode 100644
index 01e9ec3..0000000
--- a/kernel/rcu/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-obj-y += update.o srcu.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
-obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
-obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/update.c b/kernel/rcupdate.c
index 6cb3dff..b02a339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcupdate.c
@@ -53,12 +53,6 @@
#include "rcu.h"
-MODULE_ALIAS("rcupdate");
-#ifdef MODULE_PARAM_PREFIX
-#undef MODULE_PARAM_PREFIX
-#endif
-#define MODULE_PARAM_PREFIX "rcupdate."
-
module_param(rcu_expedited, int, 0);
#ifdef CONFIG_PREEMPT_RCU
@@ -154,7 +148,7 @@ int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
- if (!rcu_is_watching())
+ if (rcu_is_cpu_idle())
return 0;
if (!rcu_lockdep_current_cpu_online())
return 0;
@@ -304,7 +298,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/rcu/tiny.c b/kernel/rcutiny.c
index 1254f31..9ed6075 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcutiny.c
@@ -35,7 +35,6 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
#ifdef CONFIG_RCU_TRACE
#include <trace/events/rcu.h>
@@ -43,7 +42,7 @@
#include "rcu.h"
-/* Forward declarations for tiny_plugin.h. */
+/* Forward declarations for rcutiny_plugin.h. */
struct rcu_ctrlblk;
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static void rcu_process_callbacks(struct softirq_action *unused);
@@ -53,23 +52,22 @@ static void __call_rcu(struct rcu_head *head,
static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-#include "tiny_plugin.h"
+#include "rcutiny_plugin.h"
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
static void rcu_idle_enter_common(long long newval)
{
if (newval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("--="),
+ RCU_TRACE(trace_rcu_dyntick("--=",
rcu_dynticks_nesting, newval));
rcu_dynticks_nesting = newval;
return;
}
- RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
- rcu_dynticks_nesting, newval));
+ RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
if (!is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
+ struct task_struct *idle = idle_task(smp_processor_id());
- RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
+ RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
rcu_dynticks_nesting, newval));
ftrace_dump(DUMP_ALL);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -122,15 +120,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
static void rcu_idle_exit_common(long long oldval)
{
if (oldval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("++="),
+ RCU_TRACE(trace_rcu_dyntick("++=",
oldval, rcu_dynticks_nesting));
return;
}
- RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
+ RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
if (!is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
+ struct task_struct *idle = idle_task(smp_processor_id());
- RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
+ RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
oldval, rcu_dynticks_nesting));
ftrace_dump(DUMP_ALL);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -176,18 +174,18 @@ void rcu_irq_enter(void)
}
EXPORT_SYMBOL_GPL(rcu_irq_enter);
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Test whether RCU thinks that the current CPU is idle.
*/
-bool notrace __rcu_is_watching(void)
+int rcu_is_cpu_idle(void)
{
- return rcu_dynticks_nesting;
+ return !rcu_dynticks_nesting;
}
-EXPORT_SYMBOL(__rcu_is_watching);
+EXPORT_SYMBOL(rcu_is_cpu_idle);
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
* Test whether the current CPU was interrupted from idle. Nested
@@ -275,7 +273,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
if (&rcp->rcucblist == rcp->donetail) {
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
- !!ACCESS_ONCE(rcp->rcucblist),
+ ACCESS_ONCE(rcp->rcucblist),
need_resched(),
is_idle_task(current),
false));
@@ -306,8 +304,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
RCU_TRACE(cb_count++);
}
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
- RCU_TRACE(trace_rcu_batch_end(rcp->name,
- cb_count, 0, need_resched(),
+ RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
is_idle_task(current),
false));
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcutiny_plugin.h
index 280d06c..280d06c 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
diff --git a/kernel/rcu/torture.c b/kernel/rcutorture.c
index 3929cd4..be63101 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcutorture.c
@@ -52,12 +52,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
-MODULE_ALIAS("rcutorture");
-#ifdef MODULE_PARAM_PREFIX
-#undef MODULE_PARAM_PREFIX
-#endif
-#define MODULE_PARAM_PREFIX "rcutorture."
-
static int fqs_duration;
module_param(fqs_duration, int, 0444);
MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
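The MODULE_PARAM_PREFIX lines removed here are what give a built-in module's parameters their dotted spelling on the kernel command line; a loadable module never sees the prefix. A sketch of the effect (command-line spellings are illustrative):

    #define MODULE_PARAM_PREFIX "rcutorture."
    module_param(fqs_duration, int, 0444);
    /* built-in:  boot with  rcutorture.fqs_duration=100           */
    /* module:    modprobe rcutorture fqs_duration=100 (no prefix) */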
diff --git a/kernel/rcu/tree.c b/kernel/rcutree.c
index dd08198..32618b3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcutree.c
@@ -41,7 +41,6 @@
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -57,16 +56,17 @@
#include <linux/ftrace_event.h>
#include <linux/suspend.h>
-#include "tree.h"
+#include "rcutree.h"
#include <trace/events/rcu.h>
#include "rcu.h"
-MODULE_ALIAS("rcutree");
-#ifdef MODULE_PARAM_PREFIX
-#undef MODULE_PARAM_PREFIX
-#endif
-#define MODULE_PARAM_PREFIX "rcutree."
+/*
+ * Strings used in tracepoints need to be exported via the
+ * tracing system such that tools like perf and trace-cmd can
+ * translate the string address pointers to actual text.
+ */
+#define TPS(x) tracepoint_string(x)
/* Data structures. */
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,8 +371,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
{
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
+ struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
ftrace_dump(DUMP_ORIG);
@@ -408,7 +407,7 @@ static void rcu_eqs_enter(bool user)
long long oldval;
struct rcu_dynticks *rdtp;
- rdtp = this_cpu_ptr(&rcu_dynticks);
+ rdtp = &__get_cpu_var(rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -436,7 +435,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -479,7 +478,7 @@ void rcu_irq_exit(void)
struct rcu_dynticks *rdtp;
local_irq_save(flags);
- rdtp = this_cpu_ptr(&rcu_dynticks);
+ rdtp = &__get_cpu_var(rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -509,8 +508,7 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
rcu_cleanup_after_idle(smp_processor_id());
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
+ struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick(TPS("Error on exit: not idle task"),
oldval, rdtp->dynticks_nesting);
@@ -530,7 +528,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
- rdtp = this_cpu_ptr(&rcu_dynticks);
+ rdtp = &__get_cpu_var(rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -557,7 +555,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -601,7 +599,7 @@ void rcu_irq_enter(void)
long long oldval;
local_irq_save(flags);
- rdtp = this_cpu_ptr(&rcu_dynticks);
+ rdtp = &__get_cpu_var(rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -622,7 +620,7 @@ void rcu_irq_enter(void)
*/
void rcu_nmi_enter(void)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
if (rdtp->dynticks_nmi_nesting == 0 &&
(atomic_read(&rdtp->dynticks) & 0x1))
@@ -644,7 +642,7 @@ void rcu_nmi_enter(void)
*/
void rcu_nmi_exit(void)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
if (rdtp->dynticks_nmi_nesting == 0 ||
--rdtp->dynticks_nmi_nesting != 0)
@@ -657,34 +655,21 @@ void rcu_nmi_exit(void)
}
/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections. Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
- return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
-}
-
-/**
- * rcu_is_watching - see if RCU thinks that the current CPU is idle
+ * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
*
* If the current CPU is in its idle loop and is neither in an interrupt
* or NMI handler, return true.
*/
-bool notrace rcu_is_watching(void)
+int rcu_is_cpu_idle(void)
{
int ret;
preempt_disable();
- ret = __rcu_is_watching();
+ ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
preempt_enable();
return ret;
}
-EXPORT_SYMBOL_GPL(rcu_is_watching);
+EXPORT_SYMBOL(rcu_is_cpu_idle);
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
@@ -718,7 +703,7 @@ bool rcu_lockdep_current_cpu_online(void)
if (in_nmi())
return 1;
preempt_disable();
- rdp = this_cpu_ptr(&rcu_sched_data);
+ rdp = &__get_cpu_var(rcu_sched_data);
rnp = rdp->mynode;
ret = (rdp->grpmask & rnp->qsmaskinit) ||
!rcu_scheduler_fully_active;
@@ -738,7 +723,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+ return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}
/*
@@ -817,11 +802,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
- unsigned long j = ACCESS_ONCE(jiffies);
-
- rsp->gp_start = j;
- smp_wmb(); /* Record start time before stall time. */
- rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
+ rsp->gp_start = jiffies;
+ rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
}
/*
@@ -916,12 +898,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
force_quiescent_state(rsp); /* Kick them all. */
}
-/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -951,60 +927,22 @@ static void print_cpu_stall(struct rcu_state *rsp)
3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- /*
- * Attempt to revive the RCU machinery by forcing a context switch.
- *
- * A context switch would normally allow the RCU state machine to make
- * progress and it could be we're stuck in kernel space without context
- * switches for an entirely unreasonable amount of time.
- */
- resched_cpu(smp_processor_id());
+ set_need_resched(); /* kick ourselves to get things going. */
}
static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gps;
unsigned long j;
unsigned long js;
struct rcu_node *rnp;
- if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+ if (rcu_cpu_stall_suppress)
return;
j = ACCESS_ONCE(jiffies);
-
- /*
- * Lots of memory barriers to reject false positives.
- *
- * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
- * then rsp->gp_start, and finally rsp->completed. These values
- * are updated in the opposite order with memory barriers (or
- * equivalent) during grace-period initialization and cleanup.
- * Now, a false positive can occur if we get a new value of
- * rsp->gp_start and an old value of rsp->jiffies_stall. But given
- * the memory barriers, the only way that this can happen is if one
- * grace period ends and another starts between these two fetches.
- * Detect this by comparing rsp->completed with the previous fetch
- * from rsp->gpnum.
- *
- * Given this check, comparisons of jiffies, rsp->jiffies_stall,
- * and rsp->gp_start suffice to forestall false positives.
- */
- gpnum = ACCESS_ONCE(rsp->gpnum);
- smp_rmb(); /* Pick up ->gpnum first... */
js = ACCESS_ONCE(rsp->jiffies_stall);
- smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = ACCESS_ONCE(rsp->gp_start);
- smp_rmb(); /* ...and finally ->gp_start before ->completed. */
- completed = ACCESS_ONCE(rsp->completed);
- if (ULONG_CMP_GE(completed, gpnum) ||
- ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
- return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
if (rcu_gp_in_progress(rsp) &&
- (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -1359,7 +1297,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period.
*/
static int rcu_gp_init(struct rcu_state *rsp)
{
@@ -1368,27 +1306,18 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
- if (rsp->gp_flags == 0) {
- /* Spurious wakeup, tell caller to go back to sleep. */
- raw_spin_unlock_irq(&rnp->lock);
- return 0;
- }
rsp->gp_flags = 0; /* Clear all flags: New grace period. */
- if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
- /*
- * Grace period already in progress, don't start another.
- * Not supposed to be able to happen.
- */
+ if (rcu_gp_in_progress(rsp)) {
+ /* Grace period already in progress, don't start another. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
/* Advance to a new grace period and initialize state. */
- record_gp_stall_check_time(rsp);
- smp_wmb(); /* Record GP times before starting GP. */
rsp->gpnum++;
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
+ record_gp_stall_check_time(rsp);
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
@@ -1437,7 +1366,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
/*
* Do one round of quiescent-state forcing.
*/
-static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
{
int fqs_state = fqs_state_in;
bool isidle = false;
@@ -1522,12 +1451,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rsp->gp_flags = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("newreq"));
- }
+ if (cpu_needs_another_gp(rsp, rdp))
+ rsp->gp_flags = 1;
raw_spin_unlock_irq(&rnp->lock);
}
@@ -1537,7 +1462,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
static int __noreturn rcu_gp_kthread(void *arg)
{
int fqs_state;
- int gf;
unsigned long j;
int ret;
struct rcu_state *rsp = arg;
@@ -1547,19 +1471,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Handle grace-period start. */
for (;;) {
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("reqwait"));
wait_event_interruptible(rsp->gp_wq,
- ACCESS_ONCE(rsp->gp_flags) &
+ rsp->gp_flags &
RCU_GP_FLAG_INIT);
- if (rcu_gp_init(rsp))
+ if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
+ rcu_gp_init(rsp))
break;
cond_resched();
flush_signals(current);
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("reqwaitsig"));
}
/* Handle quiescent-state forcing. */
@@ -1569,16 +1488,10 @@ static int __noreturn rcu_gp_kthread(void *arg)
j = HZ;
jiffies_till_first_fqs = HZ;
}
- ret = 0;
for (;;) {
- if (!ret)
- rsp->jiffies_force_qs = jiffies + j;
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqswait"));
+ rsp->jiffies_force_qs = jiffies + j;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = ACCESS_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
+ (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
(!ACCESS_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp)),
j);
@@ -1587,23 +1500,13 @@ static int __noreturn rcu_gp_kthread(void *arg)
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
- (gf & RCU_GP_FLAG_FQS)) {
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqsstart"));
+ if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
fqs_state = rcu_gp_fqs(rsp, fqs_state);
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqsend"));
cond_resched();
} else {
/* Deal with stray signal. */
cond_resched();
flush_signals(current);
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqswaitsig"));
}
j = jiffies_till_next_fqs;
if (j > HZ) {
@@ -1651,8 +1554,6 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
return;
}
rsp->gp_flags = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
- TPS("newreq"));
/*
* We can't do wakeups while holding the rnp->lock, as that
@@ -2354,7 +2255,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
*/
- if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+ if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
invoke_rcu_core();
/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2824,13 +2725,10 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- if (!rdp->nxtlist)
- continue;
- hc = true;
- if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
+ if (rdp->qlen != rdp->qlen_lazy)
al = false;
- break;
- }
+ if (rdp->nxtlist)
+ hc = true;
}
if (all_lazy)
*all_lazy = al;
@@ -3318,7 +3216,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
- * replace the definitions in tree.h because those are needed to size
+ * replace the definitions in rcutree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
static void __init rcu_init_geometry(void)
@@ -3397,8 +3295,8 @@ void __init rcu_init(void)
rcu_bootup_announce();
rcu_init_geometry();
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
@@ -3413,4 +3311,4 @@ void __init rcu_init(void)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
}
-#include "tree_plugin.h"
+#include "rcutree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcutree.h
index 52be957..5f97eab 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcutree.h
@@ -104,8 +104,6 @@ struct rcu_dynticks {
/* idle-period nonlazy_posted snapshot. */
unsigned long last_accelerate;
/* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all;
- /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcutree_plugin.h
index 08a7652..130c97b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
-#include "../time/tick-internal.h"
+#include "time/tick-internal.h"
#define RCU_KTHREAD_PRIO 1
@@ -96,15 +96,10 @@ static void __init rcu_bootup_announce_oddness(void)
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
#ifdef CONFIG_RCU_NOCB_CPU_ALL
pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
+ cpumask_setall(rcu_nocb_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
if (have_rcu_nocb_mask) {
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
if (rcu_nocb_poll)
@@ -665,7 +660,7 @@ static void rcu_preempt_check_callbacks(int cpu)
static void rcu_preempt_do_callbacks(void)
{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+ rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
}
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1133,7 +1128,7 @@ void exit_rcu(void)
#ifdef CONFIG_RCU_BOOST
-#include "../locking/rtmutex_common.h"
+#include "rtmutex_common.h"
#ifdef CONFIG_RCU_TRACE
@@ -1337,7 +1332,7 @@ static void invoke_rcu_callbacks_kthread(void)
*/
static bool rcu_is_callbacks_kthread(void)
{
- return __this_cpu_read(rcu_cpu_kthread_task) == current;
+ return __get_cpu_var(rcu_cpu_kthread_task) == current;
}
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1387,8 +1382,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
static void rcu_kthread_do_work(void)
{
- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
+ rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
+ rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_do_callbacks();
}
@@ -1407,7 +1402,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- return __this_cpu_read(rcu_cpu_has_work);
+ return __get_cpu_var(rcu_cpu_has_work);
}
/*
@@ -1417,8 +1412,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
*/
static void rcu_cpu_kthread(unsigned int cpu)
{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+ char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
int spincnt;
for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1632,26 +1627,20 @@ module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
-extern int tick_nohz_active;
+extern int tick_nohz_enabled;
/*
- * Try to advance callbacks for all flavors of RCU on the current CPU, but
- * only if it has been awhile since the last time we did so. Afterwards,
- * if there are any callbacks ready for immediate invocation, return true.
+ * Try to advance callbacks for all flavors of RCU on the current CPU.
+ * Afterwards, if there are any callbacks ready for immediate invocation,
+ * return true.
*/
static bool rcu_try_advance_all_cbs(void)
{
bool cbs_ready = false;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
- /* Exit early if we advanced recently. */
- if (jiffies == rdtp->last_advance_all)
- return 0;
- rdtp->last_advance_all = jiffies;
-
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
rnp = rdp->mynode;
@@ -1729,7 +1718,7 @@ static void rcu_prepare_for_idle(int cpu)
int tne;
/* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_active);
+ tne = ACCESS_ONCE(tick_nohz_enabled);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(cpu, NULL))
invoke_rcu_core(); /* force nohz to see update. */
@@ -1750,8 +1739,6 @@ static void rcu_prepare_for_idle(int cpu)
*/
if (rdtp->all_lazy &&
rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
- rdtp->all_lazy = false;
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
invoke_rcu_core();
return;
}
@@ -1781,11 +1768,17 @@ static void rcu_prepare_for_idle(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
+ struct rcu_data *rdp;
+ struct rcu_state *rsp;
if (rcu_is_nocb_cpu(cpu))
return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
+ rcu_try_advance_all_cbs();
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_core();
+ }
}
/*
@@ -2115,22 +2108,15 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
/* If we are not being polled and there is a kthread, awaken it ... */
t = ACCESS_ONCE(rdp->nocb_kthread);
- if (rcu_nocb_poll || !t) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeNotPoll"));
+ if (rcu_nocb_poll | !t)
return;
- }
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
rdp->qlen_last_fqs_check = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
wake_up_process(t); /* ... or if many callbacks queued. */
rdp->qlen_last_fqs_check = LONG_MAX / 2;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
- } else {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
}
return;
}
@@ -2154,12 +2140,10 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
(unsigned long)rhp->func,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
+ rdp->qlen_lazy, rdp->qlen);
else
trace_rcu_callback(rdp->rsp->name, rhp,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
+ rdp->qlen_lazy, rdp->qlen);
return 1;
}
@@ -2237,7 +2221,6 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
- bool firsttime = 1;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2246,27 +2229,14 @@ static int rcu_nocb_kthread(void *arg)
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
/* If not polling, wait for next batch of callbacks. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Sleep"));
+ if (!rcu_nocb_poll)
wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
- } else if (firsttime) {
- firsttime = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Poll"));
- }
list = ACCESS_ONCE(rdp->nocb_head);
if (!list) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeEmpty"));
schedule_timeout_interruptible(1);
flush_signals(current);
continue;
}
- firsttime = 1;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeNonEmpty"));
/*
* Extract queued callbacks, update counts, and wait
@@ -2287,11 +2257,7 @@ static int rcu_nocb_kthread(void *arg)
next = list->next;
/* Wait for enqueuing to complete, if needed. */
while (next == NULL && &list->next != tail) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WaitQueue"));
schedule_timeout_interruptible(1);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeQueue"));
next = list->next;
}
debug_rcu_head_unqueue(list);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcutree_trace.c
index 3596797..cf6c174 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -44,7 +44,7 @@
#include <linux/seq_file.h>
#define RCU_TREE_NONCORE
-#include "tree.h"
+#include "rcutree.h"
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 13b243a..13b243a 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/rtmutex-debug.h
index 14193d5..14193d5 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/rtmutex-debug.h
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 1d96dd0..1d96dd0 100644
--- a/kernel/locking/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
diff --git a/kernel/locking/rtmutex.c b/kernel/rtmutex.c
index 0dd6aec..0dd6aec 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/rtmutex.c
diff --git a/kernel/locking/rtmutex.h b/kernel/rtmutex.h
index a1a1dd0..a1a1dd0 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/rtmutex.h
diff --git a/kernel/locking/rtmutex_common.h b/kernel/rtmutex_common.h
index 53a66c8..53a66c8 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
diff --git a/kernel/locking/rwsem.c b/kernel/rwsem.c
index cfff143..cfff143 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/rwsem.c
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b62140..54adcf3 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,7 +12,6 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
-obj-y += wait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
deleted file mode 100644
index a63f4dc..0000000
--- a/kernel/sched/completion.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Generic wait-for-completion handler;
- *
- * It differs from semaphores in that the default case is the opposite:
- * wait_for_completion() blocks by default whereas a semaphore does not. The
- * interface also makes it easy to 'complete' multiple waiting threads,
- * something which isn't entirely natural for semaphores.
- *
- * But more importantly, the primitive documents the usage. Semaphores would
- * typically be used for exclusion which gives rise to priority inversion.
- * Waiting for completion is typically a sync point, but not an exclusion point.
- */
-
-#include <linux/sched.h>
-#include <linux/completion.h>
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x: holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x,
- long (*action)(long), long timeout, int state)
-{
- if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
-
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
- timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- x->done--;
- return timeout ?: 1;
-}
-
-static inline long __sched
-__wait_for_common(struct completion *x,
- long (*action)(long), long timeout, int state)
-{
- might_sleep();
-
- spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
- return timeout;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
- return __wait_for_common(x, schedule_timeout, timeout, state);
-}
-
-static long __sched
-wait_for_common_io(struct completion *x, long timeout, int state)
-{
- return __wait_for_common(x, io_schedule_timeout, timeout, state);
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
- wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
- * till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
-
-/**
- * wait_for_completion_io: - waits for completion of a task
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
- */
-void __sched wait_for_completion_io(struct completion *x)
-{
- wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io);
-
-/**
- * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
- *
- * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
- * till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
-{
- return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_io_timeout);
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x: holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * Return: -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
- * or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x: holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * Return: -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
- long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
- if (t == -ERESTARTSYS)
- return t;
- return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x: holds the state of this particular completion
- * @timeout: timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
- * or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
- unsigned long timeout)
-{
- return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- * try_wait_for_completion - try to decrement a completion without blocking
- * @x: completion structure
- *
- * Return: 0 if a decrement cannot be done without blocking
- * 1 if a decrement succeeded.
- *
- * If a completion is being used as a counting completion,
- * attempt to decrement the counter without blocking. This
- * enables us to avoid waiting if the resource the completion
- * is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
-
-/**
- * completion_done - Test to see if a completion has any waiters
- * @x: completion structure
- *
- * Return: 0 if there are waiters (wait_for_completion() in progress)
- * 1 if there are no waiters.
- *
- */
-bool completion_done(struct completion *x)
-{
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda2..5ac63c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,11 +513,12 @@ static inline void init_hrtick(void)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
+#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
int cpu;
- lockdep_assert_held(&task_rq(p)->lock);
+ assert_raw_spin_locked(&task_rq(p)->lock);
if (test_tsk_need_resched(p))
return;
@@ -525,10 +526,8 @@ void resched_task(struct task_struct *p)
set_tsk_need_resched(p);
cpu = task_cpu(p);
- if (cpu == smp_processor_id()) {
- set_preempt_need_resched();
+ if (cpu == smp_processor_id())
return;
- }
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -547,7 +546,6 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
@@ -695,6 +693,12 @@ void sched_avg_update(struct rq *rq)
}
}
+#else /* !CONFIG_SMP */
+void resched_task(struct task_struct *p)
+{
+ assert_raw_spin_locked(&task_rq(p)->lock);
+ set_tsk_need_resched(p);
+}
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -763,14 +767,14 @@ static void set_load_weight(struct task_struct *p)
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -983,7 +987,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#ifdef CONFIG_LOCKDEP
/*
@@ -1013,107 +1017,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
-static void __migrate_swap_task(struct task_struct *p, int cpu)
-{
- if (p->on_rq) {
- struct rq *src_rq, *dst_rq;
-
- src_rq = task_rq(p);
- dst_rq = cpu_rq(cpu);
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
- } else {
- /*
- * Task isn't running anymore; make it appear like we migrated
- * it before it went to sleep. This means on wakeup we make the
- * previous cpu our target instead of where it really is.
- */
- p->wake_cpu = cpu;
- }
-}
-
-struct migration_swap_arg {
- struct task_struct *src_task, *dst_task;
- int src_cpu, dst_cpu;
-};
-
-static int migrate_swap_stop(void *data)
-{
- struct migration_swap_arg *arg = data;
- struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
-
- src_rq = cpu_rq(arg->src_cpu);
- dst_rq = cpu_rq(arg->dst_cpu);
-
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
- if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
-
- if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
-
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
- goto unlock;
-
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
- goto unlock;
-
- __migrate_swap_task(arg->src_task, arg->dst_cpu);
- __migrate_swap_task(arg->dst_task, arg->src_cpu);
-
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
-}
-
-/*
- * Cross migrate two tasks
- */
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
-{
- struct migration_swap_arg arg;
- int ret = -EINVAL;
-
- arg = (struct migration_swap_arg){
- .src_task = cur,
- .src_cpu = task_cpu(cur),
- .dst_task = p,
- .dst_cpu = task_cpu(p),
- };
-
- if (arg.src_cpu == arg.dst_cpu)
- goto out;
-
- /*
- * These three tests are all lockless; this is OK since all of them
- * will be re-checked with proper locks held further down the line.
- */
- if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
- goto out;
-
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
- goto out;
-
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
- goto out;
-
- ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
-
-out:
- return ret;
-}
-
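The removed migrate_swap() above is a textbook instance of a common kernel idiom: cheap lockless pre-checks, then the same conditions re-validated once the real locks are held (here via stop_two_cpus() running migrate_swap_stop()). A minimal sketch of that idiom, with every demo_* name hypothetical:

/* All demo_* names below are hypothetical illustrations. */
static int demo_pair_op(struct demo *a, struct demo *b)
{
	if (!demo_still_ok(a, b))	/* lockless pre-check: cheap but racy */
		return -EINVAL;

	demo_lock_both(a, b);
	if (!demo_still_ok(a, b)) {	/* re-validate under the locks */
		demo_unlock_both(a, b);
		return -EAGAIN;
	}
	demo_do_op(a, b);
	demo_unlock_both(a, b);
	return 0;
}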
struct migration_arg {
struct task_struct *task;
int dest_cpu;
@@ -1333,9 +1236,9 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1427,13 +1330,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
- u64 max = 2*rq->max_idle_balance_cost;
+ u64 max = 2*sysctl_sched_migration_cost;
- update_avg(&rq->avg_idle, delta);
-
- if (rq->avg_idle > max)
+ if (delta > max)
rq->avg_idle = max;
-
+ else
+ update_avg(&rq->avg_idle, delta);
rq->idle_stamp = 0;
}
#endif
@@ -1494,14 +1396,6 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- if (tif_need_resched())
- set_preempt_need_resched();
-
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
@@ -1619,7 +1513,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1701,7 +1595,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*
* __sched_fork() is basic setup used by init_idle() too:
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(struct task_struct *p)
{
p->on_rq = 0;
@@ -1725,24 +1619,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
p->mm->numa_scan_seq = 0;
}
- if (clone_flags & CLONE_VM)
- p->numa_preferred_nid = current->numa_preferred_nid;
- else
- p->numa_preferred_nid = -1;
-
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
-
- INIT_LIST_HEAD(&p->numa_entry);
- p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1768,12 +1654,12 @@ void set_numabalancing_state(bool enabled)
/*
* fork()/clone()-time setup:
*/
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+void sched_fork(struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
- __sched_fork(clone_flags, p);
+ __sched_fork(p);
/*
* We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
@@ -1831,7 +1717,10 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
- init_task_preempt_count(p);
+#ifdef CONFIG_PREEMPT_COUNT
+ /* Want to start with kernel preemption disabled. */
+ task_thread_info(p)->preempt_count = 1;
+#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif
@@ -1858,7 +1747,7 @@ void wake_up_new_task(struct task_struct *p)
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
/* Initialize new task's runnable average */
@@ -1949,7 +1838,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
trace_sched_switch(prev, next);
- sched_info_switch(rq, prev, next);
+ sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
@@ -2001,8 +1890,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -2186,7 +2073,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -2253,20 +2140,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns = 0;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
- /*
- * 64-bit doesn't need locks to atomically read a 64-bit value.
- * So we have an optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
- *
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
- * indistinguishable from the read occurring a few cycles earlier.
- */
- if (!p->on_cpu)
- return p->se.sum_exec_runtime;
-#endif
-
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, p, &flags);
@@ -2342,7 +2215,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes preempt_count_add(int val)
+void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2351,7 +2224,7 @@ void __kprobes preempt_count_add(int val)
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
#endif
- __preempt_count_add(val);
+ preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
@@ -2362,9 +2235,9 @@ void __kprobes preempt_count_add(int val)
if (preempt_count() == val)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
-EXPORT_SYMBOL(preempt_count_add);
+EXPORT_SYMBOL(add_preempt_count);
-void __kprobes preempt_count_sub(int val)
+void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2382,9 +2255,9 @@ void __kprobes preempt_count_sub(int val)
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- __preempt_count_sub(val);
+ preempt_count() -= val;
}
-EXPORT_SYMBOL(preempt_count_sub);
+EXPORT_SYMBOL(sub_preempt_count);
#endif
@@ -2557,7 +2430,6 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
- clear_preempt_need_resched();
rq->skip_clock_update = 0;
if (likely(prev != next)) {
@@ -2648,9 +2520,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count_notrace(PREEMPT_ACTIVE);
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2660,7 +2532,6 @@ asmlinkage void __sched notrace preempt_schedule(void)
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
-#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -2670,19 +2541,20 @@ EXPORT_SYMBOL(preempt_schedule);
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
+ struct thread_info *ti = current_thread_info();
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
- BUG_ON(preempt_count() || !irqs_disabled());
+ BUG_ON(ti->preempt_count || !irqs_disabled());
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -2694,6 +2566,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
+#endif /* CONFIG_PREEMPT */
+
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
@@ -2701,6 +2575,393 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
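As a usage sketch (not part of this patch; the demo_* names are invented): a sleeper blocks on a condition with wait_event_interruptible(), and the waker publishes the condition before calling wake_up(), which is a thin macro around __wake_up(q, TASK_NORMAL, 1, NULL):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);	/* hypothetical wait queue */
static int demo_ready;				/* hypothetical condition */

/* Sleeper: blocks until demo_ready is set (or a signal arrives). */
static int demo_wait(void)
{
	return wait_event_interruptible(demo_wq, demo_ready);
}

/* Waker: publish the condition, then wake the queue. */
static void demo_signal(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* all non-exclusive waiters + one exclusive */
}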
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = WF_SYNC;
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
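A hedged usage sketch for the pair (demo_* names invented): complete() hands one done count to a single waiter in queue order, while complete_all() is the teardown-style broadcast:

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);	/* hypothetical completion */

/* Worker: signal exactly one waiter, in queue order. */
static void demo_worker(void)
{
	/* ... produce the result ... */
	complete(&demo_done);
}

/* Teardown: release every waiter at once. */
static void demo_shutdown(void)
{
	complete_all(&demo_done);
}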
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
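Since the return value is remaining jiffies or zero, callers usually translate 0 into a timeout error. A sketch, where hw_ready and the 500 ms budget are assumptions:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static DECLARE_COMPLETION(hw_ready);	/* hypothetical completion */

static int demo_wait_hw(void)
{
	unsigned long left;

	left = wait_for_completion_timeout(&hw_ready, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;	/* 0: the timeout expired first */
	return 0;			/* >0: completed with 'left' jiffies to spare */
}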
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
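The killable and interruptible timeout variants return a three-way result, so callers typically distinguish signal, timeout and success. A sketch reusing the hypothetical hw_ready from above:

static int demo_wait_hw_killable(void)
{
	long t = wait_for_completion_killable_timeout(&hw_ready, HZ);

	if (t == -ERESTARTSYS)
		return t;		/* a fatal signal arrived first */
	if (t == 0)
		return -ETIMEDOUT;	/* the timeout expired */
	return 0;			/* completed with t jiffies left */
}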
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
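When a completion is used as the counting primitive the kerneldoc describes, try_wait_for_completion() is the non-blocking take-one-token operation. A sketch with invented names, assuming init_completion() ran and complete() was called once per available token:

static struct completion demo_tokens;	/* hypothetical token pool */

static bool demo_try_get_token(void)
{
	return try_wait_for_completion(&demo_tokens);	/* true: took one count */
}

static void demo_put_token(void)
{
	complete(&demo_tokens);		/* return one count to the pool */
}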
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
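completion_done() gives callers a cheap status peek without consuming a count. One plausible (hypothetical) use is avoiding a double signal from an interrupt path:

/* Hypothetical interrupt-side helper: signal only if nobody did yet. */
static void demo_maybe_complete(struct completion *x)
{
	if (!completion_done(x))
		complete(x);
}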
+
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -3337,11 +3598,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
+ get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
+ put_online_cpus();
return -ESRCH;
}
@@ -3398,6 +3661,7 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
+ put_online_cpus();
return retval;
}
@@ -3442,6 +3706,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
+ get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -3454,11 +3719,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
+ put_online_cpus();
return retval;
}
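Both affinity hunks restore the get_online_cpus()/put_online_cpus() bracket, which holds off CPU hotplug so the cpu masks read inside stay stable. The general shape of the pattern, as a sketch:

#include <linux/cpu.h>

static int demo_count_online(void)
{
	int n;

	get_online_cpus();		/* hold off CPU hotplug */
	n = num_online_cpus();		/* cpu masks are stable until the put */
	put_online_cpus();
	return n;
}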
@@ -3528,11 +3794,16 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+static inline int should_resched(void)
+{
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
static void __cond_resched(void)
{
- __preempt_count_add(PREEMPT_ACTIVE);
+ add_preempt_count(PREEMPT_ACTIVE);
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ sub_preempt_count(PREEMPT_ACTIVE);
}
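should_resched() and __cond_resched() back the familiar cond_resched() helper; long-running kernel loops call it so that even a CONFIG_PREEMPT=n kernel yields at safe points. A sketch with invented types:

#include <linux/list.h>
#include <linux/sched.h>

struct demo_item {			/* hypothetical element type */
	struct list_head node;
};

static void demo_handle_one(struct demo_item *it)
{
	/* ... hypothetical per-item work ... */
}

static void demo_process_all(struct list_head *items)
{
	struct demo_item *it;

	list_for_each_entry(it, items, node) {
		demo_handle_one(it);
		cond_resched();		/* yields only when should_resched() */
	}
}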
int __sched _cond_resched(void)
@@ -3915,7 +4186,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
- __sched_fork(0, idle);
+ __sched_fork(idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -3941,7 +4212,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
- init_idle_preempt_count(idle, cpu);
+ task_thread_info(idle)->preempt_count = 0;
/*
* The idle tasks have their own, simple scheduling class:
@@ -4075,53 +4346,6 @@ fail:
return ret;
}
-#ifdef CONFIG_NUMA_BALANCING
-/* Migrate current task p to target_cpu */
-int migrate_task_to(struct task_struct *p, int target_cpu)
-{
- struct migration_arg arg = { p, target_cpu };
- int curr_cpu = task_cpu(p);
-
- if (curr_cpu == target_cpu)
- return 0;
-
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
- return -EINVAL;
-
- /* TODO: This is not properly updating schedstats */
-
- return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
-}
-
-/*
- * Requeue a task on a given node and accurately track the number of NUMA
- * tasks on the runqueues
- */
-void sched_setnuma(struct task_struct *p, int nid)
-{
- struct rq *rq;
- unsigned long flags;
- bool on_rq, running;
-
- rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
- running = task_current(rq, p);
-
- if (on_rq)
- dequeue_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (running)
- p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
- task_rq_unlock(rq, p, &flags);
-}
-#endif
-
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
@@ -4761,7 +4985,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we don't want to free the old_rd yet then
+ * If we don't want to free the old_rt yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -4895,9 +5119,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -4909,19 +5130,11 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- sd = sd->parent; /* sd_busy */
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
-
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
-
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
@@ -5441,7 +5654,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
| sd_local_flags(level)
,
.last_balance = jiffies,
@@ -6123,17 +6335,14 @@ void __init sched_init_smp(void)
sched_init_numa();
- /*
- * There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
- * happen.
- */
+ get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6296,7 +6505,6 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7069,12 +7277,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- /*
- * If we need to toggle cfs_bandwidth_used, off->on must occur
- * before making related changes, and on->off must occur afterwards
- */
- if (runtime_enabled && !runtime_was_enabled)
- cfs_bandwidth_usage_inc();
+ account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7100,8 +7303,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
- if (runtime_was_enabled && !runtime_enabled)
- cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d18..1965599 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,7 +15,6 @@
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
-#include <linux/mempolicy.h>
#include "sched.h"
@@ -138,9 +137,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
-#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
-#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif
@@ -163,7 +159,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (task_cpu(p) != rq_cpu)
+ if (!p->on_rq || task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -229,14 +225,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
- SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
- cfs_rq->tg->cfs_bandwidth.timer_active);
- SEQ_printf(m, " .%-30s: %d\n", "throttled",
- cfs_rq->throttled);
- SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
- cfs_rq->throttle_count);
-#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -357,7 +345,7 @@ static void sched_debug_header(struct seq_file *m)
cpu_clk = local_clock();
local_irq_restore(flags);
- SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -500,56 +488,6 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
-
-
-static void sched_show_numa(struct task_struct *p, struct seq_file *m)
-{
-#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
- int node, i;
-
- if (p->mm)
- P(mm->numa_scan_seq);
-
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
- SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
- for_each_online_node(node) {
- for (i = 0; i < 2; i++) {
- unsigned long nr_faults = -1;
- int cpu_current, home_node;
-
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
-
- cpu_current = !i ? (task_node(p) == node) :
- (pol && node_isset(node, pol->v.nodes));
-
- home_node = (p->numa_preferred_nid == node);
-
- SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
- i, node, cpu_current, home_node, nr_faults);
- }
- }
-
- mpol_put(pol);
-#endif
-}
-
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
unsigned long nr_switches;
@@ -653,8 +591,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "%-45s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
-
- sched_show_numa(p, m);
}
void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ad..7c70201 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,8 +681,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static unsigned long task_h_load(struct task_struct *p);
-
static inline void __update_task_entity_contrib(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
@@ -820,12 +818,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Approximate time to scan a full NUMA task in ms. The task scan period is
- * calculated based on the task's virtual memory size and
- * numa_balancing_scan_size.
+ * numa task sample period in ms
*/
-unsigned int sysctl_numa_balancing_scan_period_min = 1000;
-unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -833,835 +830,41 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
-static unsigned int task_nr_scan_windows(struct task_struct *p)
-{
- unsigned long rss = 0;
- unsigned long nr_scan_pages;
-
- /*
- * Calculations based on RSS as non-present and empty pages are skipped
- * by the PTE scanner and NUMA hinting faults should be trapped based
- * on resident pages
- */
- nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
- rss = get_mm_rss(p->mm);
- if (!rss)
- rss = nr_scan_pages;
-
- rss = round_up(rss, nr_scan_pages);
- return rss / nr_scan_pages;
-}
-
-/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
-#define MAX_SCAN_WINDOW 2560
-
-static unsigned int task_scan_min(struct task_struct *p)
-{
- unsigned int scan, floor;
- unsigned int windows = 1;
-
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
- floor = 1000 / windows;
-
- scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
- return max_t(unsigned int, floor, scan);
-}
-
-static unsigned int task_scan_max(struct task_struct *p)
-{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
-
- /* Watch for min being lower than max due to floor calculations */
- smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
- return max(smin, smax);
-}
-
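To make the removed window arithmetic concrete (a worked example, not from the patch): with the defaults above, sysctl_numa_balancing_scan_size = 256 MB, a task with a 1 GB RSS has task_nr_scan_windows() = 1024/256 = 4; task_scan_min() is then max(1000/(2560/256), 1000/4) = max(100, 250) = 250 ms, and task_scan_max() = max(250, 60000/4) = 15000 ms.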
-/*
- * Once a preferred node is selected the scheduler balancer will prefer moving
- * a task to that node for sysctl_numa_balancing_settle_count number of PTE
- * scans. This will give the process the chance to accumulate more faults on
- * the preferred node but still allow the scheduler to move the task again if
- * the node's CPUs are overloaded.
- */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
-
-static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
-{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
- rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
-}
-
-static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
-{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
- rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
-}
-
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- struct list_head task_list;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long faults[0];
-};
-
-pid_t task_numa_group_id(struct task_struct *p)
-{
- return p->numa_group ? p->numa_group->gid : 0;
-}
-
-static inline int task_faults_idx(int nid, int priv)
-{
- return 2 * nid + priv;
-}
-
-static inline unsigned long task_faults(struct task_struct *p, int nid)
-{
- if (!p->numa_faults)
- return 0;
-
- return p->numa_faults[task_faults_idx(nid, 0)] +
- p->numa_faults[task_faults_idx(nid, 1)];
-}
-
-static inline unsigned long group_faults(struct task_struct *p, int nid)
-{
- if (!p->numa_group)
- return 0;
-
- return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
-}
-
-/*
- * These return the fraction of accesses done by a particular task, or
- * task group, on a particular numa node. The group weight is given a
- * larger multiplier, in order to group tasks together that are almost
- * evenly spread out between numa nodes.
- */
-static inline unsigned long task_weight(struct task_struct *p, int nid)
-{
- unsigned long total_faults;
-
- if (!p->numa_faults)
- return 0;
-
- total_faults = p->total_numa_faults;
-
- if (!total_faults)
- return 0;
-
- return 1000 * task_faults(p, nid) / total_faults;
-}
-
-static inline unsigned long group_weight(struct task_struct *p, int nid)
-{
- if (!p->numa_group || !p->numa_group->total_faults)
- return 0;
-
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
-}
-
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static unsigned long power_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
-
-/* Cached statistics for all CPUs within a node */
-struct numa_stats {
- unsigned long nr_running;
- unsigned long load;
-
- /* Total compute capacity of CPUs on a node */
- unsigned long power;
-
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long capacity;
- int has_capacity;
-};
-
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
-{
- int cpu, cpus = 0;
-
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
-
- ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
- ns->power += power_of(cpu);
-
- cpus++;
- }
-
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll either bail at !has_capacity, or we'll detect a huge imbalance
- * and bail there.
- */
- if (!cpus)
- return;
-
- ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
- ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
- ns->has_capacity = (ns->nr_running < ns->capacity);
-}
-
-struct task_numa_env {
- struct task_struct *p;
-
- int src_cpu, src_nid;
- int dst_cpu, dst_nid;
-
- struct numa_stats src_stats, dst_stats;
-
- int imbalance_pct, idx;
-
- struct task_struct *best_task;
- long best_imp;
- int best_cpu;
-};
-
-static void task_numa_assign(struct task_numa_env *env,
- struct task_struct *p, long imp)
-{
- if (env->best_task)
- put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
-
- env->best_task = p;
- env->best_imp = imp;
- env->best_cpu = env->dst_cpu;
-}
-
-/*
- * This checks if the overall compute and NUMA accesses of the system would
- * be improved if the source task was migrated to the target dst_cpu, taking
- * into account that it might be best to exchange the task running on the
- * dst_cpu with the source task.
- */
-static void task_numa_compare(struct task_numa_env *env,
- long taskimp, long groupimp)
-{
- struct rq *src_rq = cpu_rq(env->src_cpu);
- struct rq *dst_rq = cpu_rq(env->dst_cpu);
- struct task_struct *cur;
- long dst_load, src_load;
- long load;
- long imp = (groupimp > 0) ? groupimp : taskimp;
-
- rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
- cur = NULL;
-
- /*
- * "imp" is the fault differential for the source task between the
- * source and destination node. Calculate the total differential for
- * the source task and potential destination task. The more negative
- * the value is, the more remote accesses would be expected to
- * be incurred if the tasks were swapped.
- */
- if (cur) {
- /* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
- goto unlock;
-
- /*
- * If dst and source tasks are in the same NUMA group, or not
- * in any group then look only at task weights.
- */
- if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
- /*
- * Add some hysteresis to prevent swapping the
- * tasks within a group over tiny differences.
- */
- if (cur->numa_group)
- imp -= imp/16;
- } else {
- /*
- * Compare the group weights. If a task is all by
- * itself (not part of a group), use the task weight
- * instead.
- */
- if (env->p->numa_group)
- imp = groupimp;
- else
- imp = taskimp;
-
- if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
- else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
- }
- }
-
- if (imp < env->best_imp)
- goto unlock;
-
- if (!cur) {
- /* Is there capacity at our destination? */
- if (env->src_stats.has_capacity &&
- !env->dst_stats.has_capacity)
- goto unlock;
-
- goto balance;
- }
-
- /* Balance doesn't matter much if we're running a task per cpu */
- if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
- goto assign;
-
- /*
- * In the overloaded case, try and keep the load balanced.
- */
-balance:
- dst_load = env->dst_stats.load;
- src_load = env->src_stats.load;
-
- /* XXX missing power terms */
- load = task_h_load(env->p);
- dst_load += load;
- src_load -= load;
-
- if (cur) {
- load = task_h_load(cur);
- dst_load -= load;
- src_load += load;
- }
-
- /* make src_load the smaller */
- if (dst_load < src_load)
- swap(dst_load, src_load);
-
- if (src_load * env->imbalance_pct < dst_load * 100)
- goto unlock;
-
-assign:
- task_numa_assign(env, cur, imp);
-unlock:
- rcu_read_unlock();
-}
-
-static void task_numa_find_cpu(struct task_numa_env *env,
- long taskimp, long groupimp)
-{
- int cpu;
-
- for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
- /* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
- continue;
-
- env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp);
- }
-}
-
-static int task_numa_migrate(struct task_struct *p)
-{
- struct task_numa_env env = {
- .p = p,
-
- .src_cpu = task_cpu(p),
- .src_nid = task_node(p),
-
- .imbalance_pct = 112,
-
- .best_task = NULL,
- .best_imp = 0,
- .best_cpu = -1
- };
- struct sched_domain *sd;
- unsigned long taskweight, groupweight;
- int nid, ret;
- long taskimp, groupimp;
-
- /*
- * Pick the lowest SD_NUMA domain, as that would have the smallest
- * imbalance and would be the first to start moving tasks about.
- *
- * And we want to avoid any moving of tasks about, as that would create
- * random movement of tasks, countering the NUMA conditions we're trying
- * to satisfy here.
- */
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
- env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
- rcu_read_unlock();
-
- /*
- * Cpusets can break the scheduler domain tree into smaller
- * balance domains, some of which do not cross NUMA boundaries.
- * Tasks that are "trapped" in such domains cannot be migrated
- * elsewhere, so there is no point in (re)trying.
- */
- if (unlikely(!sd)) {
- p->numa_preferred_nid = cpu_to_node(task_cpu(p));
- return -EINVAL;
- }
-
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
- env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
-
- /* If the preferred nid has capacity, try to use it. */
- if (env.dst_stats.has_capacity)
- task_numa_find_cpu(&env, taskimp, groupimp);
-
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
- for_each_online_node(nid) {
- if (nid == env.src_nid || nid == p->numa_preferred_nid)
- continue;
-
- /* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
- if (taskimp < 0 && groupimp < 0)
- continue;
-
- env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
- task_numa_find_cpu(&env, taskimp, groupimp);
- }
- }
-
- /* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
- return -EAGAIN;
-
- sched_setnuma(p, env.dst_nid);
-
- /*
- * Reset the scan period if the task is being rescheduled on an
- * alternative node to recheck if the task is now properly placed.
- */
- p->numa_scan_period = task_scan_min(p);
-
- if (env.best_task == NULL) {
- int ret = migrate_task_to(p, env.best_cpu);
- return ret;
- }
-
- ret = migrate_swap(p, env.best_task);
- put_task_struct(env.best_task);
- return ret;
-}
-
-/* Attempt to migrate a task to a CPU on the preferred node. */
-static void numa_migrate_preferred(struct task_struct *p)
-{
- /* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
- return;
-
- /* Periodically retry migrating the task to the preferred node */
- p->numa_migrate_retry = jiffies + HZ;
-
- /* Success if task is already running on preferred CPU */
- if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
- return;
-
- /* Otherwise, try migrate to a CPU on the preferred node */
- task_numa_migrate(p);
-}
-
-/*
- * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
- * increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
- */
-#define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
-
-/*
- * Increase the scan period (slow down scanning) if the majority of
- * our memory is already on our local node, or if the majority of
- * the page accesses are shared with other processes.
- * Otherwise, decrease the scan period.
- */
-static void update_task_scan_period(struct task_struct *p,
- unsigned long shared, unsigned long private)
-{
- unsigned int period_slot;
- int ratio;
- int diff;
-
- unsigned long remote = p->numa_faults_locality[0];
- unsigned long local = p->numa_faults_locality[1];
-
- /*
- * If there were no recorded hinting faults then either the task is
- * completely idle or all activity is in areas that are not of interest
- * to automatic numa balancing. Scan slower.
- */
- if (local + shared == 0) {
- p->numa_scan_period = min(p->numa_scan_period_max,
- p->numa_scan_period << 1);
-
- p->mm->numa_next_scan = jiffies +
- msecs_to_jiffies(p->numa_scan_period);
-
- return;
- }
-
- /*
- * Prepare to scale scan period relative to the current period.
- * == NUMA_PERIOD_THRESHOLD scan period stays the same
- * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
- * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
- */
- period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
- if (!slot)
- slot = 1;
- diff = slot * period_slot;
- } else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
- /*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
- */
- period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
- }
-
- p->numa_scan_period = clamp(p->numa_scan_period + diff,
- task_scan_min(p), task_scan_max(p));
- memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
-}
-
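A worked pass through the removed scaling, for concreteness: with numa_scan_period = 1000 ms, period_slot = DIV_ROUND_UP(1000, 10) = 100 ms. Mostly-local faults (local = 9, remote = 1) give ratio = 90/10 = 9, so slot = 9 - 3 = 6 and diff = +600 ms (scan slower). Mostly-remote faults (local = 2, remote = 8) give ratio = 2, so diff starts at -(3 - 2) * 100 = -100 ms and is then scaled by the private fraction, e.g. private == shared halves it to -50 ms (scan faster), before the clamp to [task_scan_min(), task_scan_max()].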
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1, max_group_nid = -1;
- unsigned long max_faults = 0, max_group_faults = 0;
- unsigned long fault_types[2] = { 0, 0 };
- spinlock_t *group_lock = NULL;
+ int seq;
+ if (!p->mm) /* for example, ksmd faulting in a user's mm */
+ return;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
- p->numa_scan_period_max = task_scan_max(p);
-
- /* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
- spin_lock(group_lock);
- }
-
- /* Find the node with the highest number of faults */
- for_each_online_node(nid) {
- unsigned long faults = 0, group_faults = 0;
- int priv, i;
-
- for (priv = 0; priv < 2; priv++) {
- long diff;
-
- i = task_faults_idx(nid, priv);
- diff = -p->numa_faults[i];
-
- /* Decay existing window, copy faults since last scan */
- p->numa_faults[i] >>= 1;
- p->numa_faults[i] += p->numa_faults_buffer[i];
- fault_types[priv] += p->numa_faults_buffer[i];
- p->numa_faults_buffer[i] = 0;
-
- faults += p->numa_faults[i];
- diff += p->numa_faults[i];
- p->total_numa_faults += diff;
- if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
- }
- }
-
- if (faults > max_faults) {
- max_faults = faults;
- max_nid = nid;
- }
-
- if (group_faults > max_group_faults) {
- max_group_faults = group_faults;
- max_group_nid = nid;
- }
- }
-
- update_task_scan_period(p, fault_types[0], fault_types[1]);
-
- if (p->numa_group) {
- /*
- * If the preferred task and group nids are different,
- * iterate over the nodes again to find the best place.
- */
- if (max_nid != max_group_nid) {
- unsigned long weight, max_weight = 0;
-
- for_each_online_node(nid) {
- weight = task_weight(p, nid) + group_weight(p, nid);
- if (weight > max_weight) {
- max_weight = weight;
- max_nid = nid;
- }
- }
- }
-
- spin_unlock(group_lock);
- }
- /* Preferred node as the node with the most faults */
- if (max_faults && max_nid != p->numa_preferred_nid) {
- /* Update the preferred nid and migrate task if possible */
- sched_setnuma(p, max_nid);
- numa_migrate_preferred(p);
- }
-}
-
-static inline int get_numa_group(struct numa_group *grp)
-{
- return atomic_inc_not_zero(&grp->refcount);
-}
-
-static inline void put_numa_group(struct numa_group *grp)
-{
- if (atomic_dec_and_test(&grp->refcount))
- kfree_rcu(grp, rcu);
-}
-
-static void task_numa_group(struct task_struct *p, int cpupid, int flags,
- int *priv)
-{
- struct numa_group *grp, *my_grp;
- struct task_struct *tsk;
- bool join = false;
- int cpu = cpupid_to_cpu(cpupid);
- int i;
-
- if (unlikely(!p->numa_group)) {
- unsigned int size = sizeof(struct numa_group) +
- 2*nr_node_ids*sizeof(unsigned long);
-
- grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!grp)
- return;
-
- atomic_set(&grp->refcount, 1);
- spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
- grp->gid = p->pid;
-
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] = p->numa_faults[i];
-
- grp->total_faults = p->total_numa_faults;
-
- list_add(&p->numa_entry, &grp->task_list);
- grp->nr_tasks++;
- rcu_assign_pointer(p->numa_group, grp);
- }
-
- rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
-
- if (!cpupid_match_pid(tsk, cpupid))
- goto no_join;
-
- grp = rcu_dereference(tsk->numa_group);
- if (!grp)
- goto no_join;
-
- my_grp = p->numa_group;
- if (grp == my_grp)
- goto no_join;
-
- /*
- * Only join the other group if it's bigger; if we're the bigger group,
- * the other task will join us.
- */
- if (my_grp->nr_tasks > grp->nr_tasks)
- goto no_join;
-
- /*
- * Tie-break on the grp address.
- */
- if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
- goto no_join;
-
- /* Always join threads in the same process. */
- if (tsk->mm == current->mm)
- join = true;
-
- /* Simple filter to avoid false positives due to PID collisions */
- if (flags & TNF_SHARED)
- join = true;
-
- /* Update priv based on whether false sharing was detected */
- *priv = !join;
-
- if (join && !get_numa_group(grp))
- goto no_join;
-
- rcu_read_unlock();
-
- if (!join)
- return;
-
- double_lock(&my_grp->lock, &grp->lock);
-
- for (i = 0; i < 2*nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults[i];
- grp->faults[i] += p->numa_faults[i];
- }
- my_grp->total_faults -= p->total_numa_faults;
- grp->total_faults += p->total_numa_faults;
-
- list_move(&p->numa_entry, &grp->task_list);
- my_grp->nr_tasks--;
- grp->nr_tasks++;
-
- spin_unlock(&my_grp->lock);
- spin_unlock(&grp->lock);
-
- rcu_assign_pointer(p->numa_group, grp);
-
- put_numa_group(my_grp);
- return;
-
-no_join:
- rcu_read_unlock();
- return;
-}
-
-void task_numa_free(struct task_struct *p)
-{
- struct numa_group *grp = p->numa_group;
- int i;
- void *numa_faults = p->numa_faults;
-
- if (grp) {
- spin_lock(&grp->lock);
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults[i];
- grp->total_faults -= p->total_numa_faults;
-
- list_del(&p->numa_entry);
- grp->nr_tasks--;
- spin_unlock(&grp->lock);
- rcu_assign_pointer(p->numa_group, NULL);
- put_numa_group(grp);
- }
-
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
- kfree(numa_faults);
+ /* FIXME: Scheduling placement policy hints go here */
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int node, int pages, bool migrated)
{
struct task_struct *p = current;
- bool migrated = flags & TNF_MIGRATED;
- int priv;
if (!numabalancing_enabled)
return;
- /* for example, ksmd faulting in a user's mm */
- if (!p->mm)
- return;
-
- /* Do not worry about placement if exiting */
- if (p->state == TASK_DEAD)
- return;
-
- /* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
-
- /* numa_faults and numa_faults_buffer share the allocation */
- p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults)
- return;
-
- BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
- p->total_numa_faults = 0;
- memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
- }
+ /* FIXME: Allocate task-specific structure for placement policy here */
/*
- * First accesses are treated as private, otherwise consider accesses
- * to be private if the accessing pid has not changed
+ * If pages are properly placed (did not migrate) then scan slower.
+	 * This is reset periodically in case of phase changes.
*/
- if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
- priv = 1;
- } else {
- priv = cpupid_match_pid(p, last_cpupid);
- if (!priv && !(flags & TNF_NO_GROUP))
- task_numa_group(p, last_cpupid, flags, &priv);
- }
+ if (!migrated)
+ p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+ p->numa_scan_period + jiffies_to_msecs(10));
task_numa_placement(p);
-
- /*
-	 * Retry task to preferred node migration periodically, in case it
-	 * previously failed, or the scheduler moved us.
- */
- if (time_after(jiffies, p->numa_migrate_retry))
- numa_migrate_preferred(p);
-
- if (migrated)
- p->numa_pages_migrated += pages;
-
- p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
- p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
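The reverted task_numa_fault() backs scanning off linearly while placement looks good: each fault on a properly placed page adds jiffies_to_msecs(10) to the scan period, clamped at the sysctl maximum, and the period is reset elsewhere when a phase change is suspected. A standalone user-space model of that back-off; the constants stand in for the sysctls and an assumed HZ=250, not the kernel's actual defaults:

#include <stdio.h>

#define SCAN_PERIOD_MIN  1000	/* ms, stand-in for the _min sysctl */
#define SCAN_PERIOD_MAX 60000	/* ms, stand-in for the _max sysctl */
#define STEP_MS            40	/* jiffies_to_msecs(10) at HZ=250 */

/* One well-placed fault: grow the period, but never past the cap. */
static unsigned int backoff(unsigned int period_ms)
{
	unsigned int next = period_ms + STEP_MS;

	return next < SCAN_PERIOD_MAX ? next : SCAN_PERIOD_MAX;
}

int main(void)
{
	unsigned int period = SCAN_PERIOD_MIN;
	int i;

	for (i = 0; i < 5; i++) {
		period = backoff(period);
		printf("period after fault %d: %u ms\n", i + 1, period);
	}
	return 0;
}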
static void reset_ptenuma_scan(struct task_struct *p)
@@ -1681,7 +884,6 @@ void task_numa_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
- unsigned long nr_pte_updates = 0;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -1698,9 +900,35 @@ void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
- if (!mm->numa_next_scan) {
- mm->numa_next_scan = now +
- msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ /*
+ * We do not care about task placement until a task runs on a node
+ * other than the first one used by the address space. This is
+ * largely because migrations are driven by what CPU the task
+ * is running on. If it's never scheduled on another node, it'll
+	 * not migrate, so why bother trapping the fault.
+ */
+ if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+ mm->first_nid = numa_node_id();
+ if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+ /* Are we running on a new node yet? */
+ if (numa_node_id() == mm->first_nid &&
+ !sched_feat_numa(NUMA_FORCE))
+ return;
+
+ mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+ }
+
+ /*
+ * Reset the scan period if enough time has gone by. Objective is that
+ * scanning will be reduced if pages are properly placed. As tasks
+ * can enter different phases this needs to be re-examined. Lacking
+ * proper tracking of reference behaviour, this blunt hammer is used.
+ */
+ migrate = mm->numa_next_reset;
+ if (time_after(now, migrate)) {
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+ xchg(&mm->numa_next_reset, next_scan);
}
/*
@@ -1710,20 +938,20 @@ void task_numa_work(struct callback_head *work)
if (time_before(now, migrate))
return;
- if (p->numa_scan_period == 0) {
- p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
- }
+ if (p->numa_scan_period == 0)
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
- * Delay this task enough that another task of this mm will likely win
- * the next time around.
+ * Do not set pte_numa if the current running node is rate-limited.
+	 * This loses statistics on the fault, but if we are unwilling to
+	 * migrate to this node, it is less likely we can do useful work.
*/
- p->node_stamp += 2 * TICK_NSEC;
+ if (migrate_ratelimited(numa_node_id()))
+ return;
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
@@ -1739,32 +967,18 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma))
continue;
- /*
- * Shared library pages mapped by multiple processes are not
- * migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
- * as migrating the pages will be of marginal benefit.
- */
- if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ /* Skip small VMAs. They are not likely to be of relevance */
+ if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- nr_pte_updates += change_prot_numa(vma, start, end);
-
- /*
- * Scan sysctl_numa_balancing_scan_size but ensure that
- * at least one PTE is updated so that unused virtual
- * address space is quickly skipped.
- */
- if (nr_pte_updates)
- pages -= (end - start) >> PAGE_SHIFT;
+ pages -= change_prot_numa(vma, start, end);
start = end;
if (pages <= 0)
@@ -1774,10 +988,10 @@ void task_numa_work(struct callback_head *work)
out:
/*
- * It is possible to reach the end of the VMA list but the last few
- * VMAs are not guaranteed to be vma_migratable. If they are not, we
- * would find the !migratable VMA on the next scan but not reset the
- * scanner to the start so check it now.
+ * It is possible to reach the end of the VMA list but the last few VMAs are
+ * not guaranteed to be vma_migratable. If they are not, we would find the
+ * !migratable VMA on the next scan but not reset the scanner to the start
+ * so check it now.
*/
if (vma)
mm->numa_scan_offset = start;
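Earlier in this hunk, the cmpxchg() on mm->numa_next_scan is the usual one-winner-per-window idiom: every thread of the mm reads the old deadline, but only the thread whose compare still sees that value gets to advance it and proceed with the scan; everyone else bails out. A hedged sketch of the pattern with illustrative names (time_before() and cmpxchg() are real kernel APIs):

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/types.h>

/* @deadline plays the role of mm->numa_next_scan. Returns true for
 * exactly one caller per expired window. */
static bool claim_window(unsigned long *deadline,
			 unsigned long now, unsigned long next)
{
	unsigned long old = *deadline;

	if (time_before(now, old))
		return false;		/* window has not opened yet */

	/* Only the caller that still observes @old wins the window. */
	return cmpxchg(deadline, old, next) == old;
}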
@@ -1811,8 +1025,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
- curr->node_stamp += period;
+ curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->node_stamp = now;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1824,14 +1038,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
-
-static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
-{
-}
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -1841,12 +1047,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
- if (entity_is_task(se)) {
- struct rq *rq = rq_of(cfs_rq);
-
- account_numa_enqueue(rq, task_of(se));
- list_add(&se->group_node, &rq->cfs_tasks);
- }
+ if (entity_is_task(se))
+ list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
@@ -1857,10 +1059,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ if (entity_is_task(se))
list_del_init(&se->group_node);
- }
cfs_rq->nr_running--;
}
@@ -2178,7 +1378,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
long contrib;
/* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
+ contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
@@ -2870,14 +2070,13 @@ static inline bool cfs_bandwidth_used(void)
return static_key_false(&__cfs_bandwidth_used);
}
-void cfs_bandwidth_usage_inc(void)
+void account_cfs_bandwidth_used(int enabled, int was_enabled)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
-}
-
-void cfs_bandwidth_usage_dec(void)
-{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ /* only need to count groups transitioning between enabled/!enabled */
+ if (enabled && !was_enabled)
+ static_key_slow_inc(&__cfs_bandwidth_used);
+ else if (!enabled && was_enabled)
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -2885,8 +2084,7 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void cfs_bandwidth_usage_inc(void) {}
-void cfs_bandwidth_usage_dec(void) {}
+void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
#endif /* HAVE_JUMP_LABEL */
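The reverted account_cfs_bandwidth_used() above counts enable/disable transitions into a jump label rather than exposing raw inc/dec operations; the point of the pattern is that the read side compiles to a patched no-op branch when no group uses bandwidth control. A hedged sketch pairing that fast path with the edge-counted update, using the era's static_key API; feature_used and the function names are illustrative stand-ins:

#include <linux/jump_label.h>

static struct static_key feature_used = STATIC_KEY_INIT_FALSE;

/* Read side: compiles to a NOP (or a JMP once enabled), so the
 * common disabled case costs nothing. */
static inline bool feature_active(void)
{
	return static_key_false(&feature_used);
}

/* Write side: count state *transitions*, never repeated states, so
 * the key's internal count equals the number of enabled users. */
static void account_feature(int enabled, int was_enabled)
{
	if (enabled && !was_enabled)
		static_key_slow_inc(&feature_used);
	else if (!enabled && was_enabled)
		static_key_slow_dec(&feature_used);
}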
/*
@@ -3137,8 +2335,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
}
@@ -3252,13 +2448,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (idle)
goto out_unlock;
- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -3319,13 +2508,7 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/*
- * Are we near the end of the current quota period?
- *
- * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
- * migrate_hrtimers, base is never cleared, so we are fine.
- */
+/* are we near the end of the current quota period? */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -3401,12 +2584,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
return;
- }
+ raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -3527,11 +2708,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
+ /* ensure cfs_b->lock is available while we wait */
+ hrtimer_cancel(&cfs_b->period_timer);
+
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
@@ -3932,7 +3113,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent || !wl) /* the trivial, non-cgroup case */
+ if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
@@ -3985,7 +3166,8 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
}
#else
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+ unsigned long wl, unsigned long wg)
{
return wl;
}
@@ -4238,10 +3420,11 @@ done:
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
+ int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
@@ -4721,12 +3904,9 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-enum fbq_type { regular, remote, all };
-
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
-#define LBF_DST_PINNED 0x04
-#define LBF_SOME_PINNED 0x08
+#define LBF_SOME_PINNED 0x04
struct lb_env {
struct sched_domain *sd;
@@ -4749,8 +3929,6 @@ struct lb_env {
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
-
- enum fbq_type fbq_type;
};
/*
@@ -4797,78 +3975,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}
-#ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
-{
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
- !(env->sd->flags & SD_NUMA)) {
- return false;
- }
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
-
- if (src_nid == dst_nid)
- return false;
-
- /* Always encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
-
- /* If both task and group weight improve, this move is a winner. */
- if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
- group_weight(p, dst_nid) > group_weight(p, src_nid))
- return true;
-
- return false;
-}
-
-
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
-{
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- return false;
-
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return false;
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
-
- if (src_nid == dst_nid)
- return false;
-
- /* Migrating away from the preferred node is always bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
-
- /* If either task or group weight get worse, don't do it. */
- if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
- group_weight(p, dst_nid) < group_weight(p, src_nid))
- return true;
-
- return false;
-}
-
-#else
-static inline bool migrate_improves_locality(struct task_struct *p,
- struct lb_env *env)
-{
- return false;
-}
-
-static inline bool migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
-{
- return false;
-}
-#endif
-
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -4891,8 +3997,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
- env->flags |= LBF_SOME_PINNED;
-
/*
* Remember if this task can be migrated to any other cpu in
* our sched_group. We may want to revisit it if we couldn't
@@ -4901,13 +4005,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* Also avoid computing new_dst_cpu if we have already computed
* one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- env->flags |= LBF_DST_PINNED;
+ env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = cpu;
break;
}
@@ -4926,24 +4030,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
- if (!tsk_cache_hot)
- tsk_cache_hot = migrate_degrades_locality(p, env);
-
- if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
- if (tsk_cache_hot) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
- }
-#endif
- return 1;
- }
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -4986,6 +4077,8 @@ static int move_one_task(struct lb_env *env)
return 0;
}
+static unsigned long task_h_load(struct task_struct *p);
+
static const unsigned int sched_nr_migrate_break = 32;
/*
@@ -5198,10 +4291,6 @@ struct sg_lb_stats {
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
-#ifdef CONFIG_NUMA_BALANCING
- unsigned int nr_numa_running;
- unsigned int nr_preferred_running;
-#endif
};
/*
@@ -5241,7 +4330,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
*
* Return: The load index.
*/
@@ -5358,7 +4447,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power, power_orig;
+ unsigned long power;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -5370,7 +4459,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
return;
}
- power_orig = power = 0;
+ power = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5378,33 +4467,8 @@ void update_group_power(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group_power *sgp;
- struct rq *rq = cpu_rq(cpu);
-
- /*
- * build_sched_domains() -> init_sched_groups_power()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use power_of(), which is set irrespective of domains
- * in update_cpu_power().
- *
- * This avoids power/power_orig from being 0 and
- * causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct power_orig.
- */
- if (unlikely(!rq->sd)) {
- power_orig += power_of(cpu);
- power += power_of(cpu);
- continue;
- }
-
- sgp = rq->sd->groups->sgp;
- power_orig += sgp->power_orig;
- power += sgp->power;
- }
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
} else {
/*
* !SD_OVERLAP domains can assume that child groups
@@ -5413,14 +4477,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power_orig += group->sgp->power_orig;
power += group->sgp->power;
group = group->next;
} while (group != child->groups);
}
- sdg->sgp->power_orig = power_orig;
- sdg->sgp->power = power;
+ sdg->sgp->power_orig = sdg->sgp->power = power;
}
/*
@@ -5464,12 +4526,13 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* cpu 3 and leave one of the cpus in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
- * by noticing the lower domain failed to reach balance and had difficulty
- * moving tasks due to affinity constraints.
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
*
* When this is so detected; this group becomes a candidate for busiest; see
- * update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -5477,36 +4540,49 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* subtle and fragile situation.
*/
-static inline int sg_imbalanced(struct sched_group *group)
+struct sg_imb_stats {
+ unsigned long max_nr_running, min_nr_running;
+ unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
- return group->sgp->imbalance;
+ sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+ sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}
-/*
- * Compute the group capacity.
- *
- * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit power unit capacity with that.
- */
-static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+ unsigned long load, unsigned long nr_running)
{
- unsigned int capacity, smt, cpus;
- unsigned int power, power_orig;
-
- power = group->sgp->power;
- power_orig = group->sgp->power_orig;
- cpus = group->group_weight;
+ if (load > sgi->max_cpu_load)
+ sgi->max_cpu_load = load;
+ if (sgi->min_cpu_load > load)
+ sgi->min_cpu_load = load;
- /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
- capacity = cpus / smt; /* cores */
+ if (nr_running > sgi->max_nr_running)
+ sgi->max_nr_running = nr_running;
+ if (sgi->min_nr_running > nr_running)
+ sgi->min_nr_running = nr_running;
+}
- capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of a task.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+ (sgi->max_nr_running - sgi->min_nr_running) > 1)
+ return 1;
- return capacity;
+ return 0;
}
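Worked numbers may help with the reverted sg_imbalanced() test: it fires only when the load spread across the group's CPUs is at least one average task weight and the busiest CPU carries at least two more tasks than the idlest. A standalone example with made-up figures showing both conditions passing:

#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 1;
	unsigned long load_per_task = 1024;

	/* Spread of 2048 >= one task's weight, and the busiest CPU
	 * runs two more tasks than the idlest: group is imbalanced. */
	int imb = (max_cpu_load - min_cpu_load) >= load_per_task &&
		  (max_nr_running - min_nr_running) > 1;

	printf("group_imb = %d\n", imb);	/* prints 1 */
	return 0;
}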
/**
@@ -5521,11 +4597,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs)
{
+ struct sg_imb_stats sgi;
unsigned long nr_running;
unsigned long load;
int i;
- memset(sgs, 0, sizeof(*sgs));
+ init_sg_imb_stats(&sgi);
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -5533,22 +4610,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
/* Bias balancing toward cpus of our domain */
- if (local_group)
+ if (local_group) {
load = target_load(i, load_idx);
- else
+ } else {
load = source_load(i, load_idx);
+ update_sg_imb_stats(&sgi, load, nr_running);
+ }
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
-#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
}
+ if (local_group && (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, group->sgp->next_update)))
+ update_group_power(env->sd, env->dst_cpu);
+
/* Adjust by relative CPU power of the group */
sgs->group_power = group->sgp->power;
sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -5556,10 +4635,15 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- sgs->group_weight = group->group_weight;
+ sgs->group_imb = sg_imbalanced(sgs, &sgi);
+
+ sgs->group_capacity =
+ DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
+
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(env->sd, group);
- sgs->group_imb = sg_imbalanced(group);
- sgs->group_capacity = sg_capacity(env, group);
+ sgs->group_weight = group->group_weight;
if (sgs->group_capacity > sgs->sum_nr_running)
sgs->group_has_capacity = 1;
@@ -5609,42 +4693,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
}
-#ifdef CONFIG_NUMA_BALANCING
-static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
-{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
- return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
- return remote;
- return all;
-}
-
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
-{
- if (rq->nr_running > rq->nr_numa_running)
- return regular;
- if (rq->nr_running > rq->nr_preferred_running)
- return remote;
- return all;
-}
-#else
-static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
-{
- return all;
-}
-
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
-{
- return regular;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env,
+ struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
@@ -5664,17 +4720,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (local_group) {
sds->local = sg;
sgs = &sds->local_stat;
-
- if (env->idle != CPU_NEWLY_IDLE ||
- time_after_eq(jiffies, sg->sgp->next_update))
- update_group_power(env->sd, env->dst_cpu);
}
+ memset(sgs, 0, sizeof(*sgs));
update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
- if (local_group)
- goto next_group;
-
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
@@ -5685,25 +4735,21 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_capacity)
+ if (prefer_sibling && !local_group &&
+ sds->local && sds->local_stat.group_has_capacity)
sgs->group_capacity = min(sgs->group_capacity, 1U);
- if (update_sd_pick_busiest(env, sds, sg, sgs)) {
- sds->busiest = sg;
- sds->busiest_stat = *sgs;
- }
-
-next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_pwr += sgs->group_power;
+ if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+ sds->busiest = sg;
+ sds->busiest_stat = *sgs;
+ }
+
sg = sg->next;
} while (sg != env->sd->groups);
-
- if (env->sd->flags & SD_NUMA)
- env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
@@ -6007,39 +5053,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power, capacity, wl;
- enum fbq_type rt;
-
- rq = cpu_rq(i);
- rt = fbq_classify_rq(rq);
+ unsigned long power = power_of(i);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power,
+ SCHED_POWER_SCALE);
+ unsigned long wl;
- /*
- * We classify groups/runqueues into three groups:
- * - regular: there are !numa tasks
- * - remote: there are numa tasks that run on the 'wrong' node
- * - all: there is no distinction
- *
- * In order to avoid migrating ideally placed numa tasks,
- * ignore those when there's better options.
- *
- * If we ignore the actual busiest queue to migrate another
- * task, the next balance pass can still reduce the busiest
- * queue by moving tasks around inside the node.
- *
- * If we cannot move enough load due to this classification
- * the next pass will adjust the group classification and
- * allow migration of more tasks.
- *
- * Both cases only affect the total convergence complexity.
- */
- if (rt > env->fbq_type)
- continue;
-
- power = power_of(i);
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
+ rq = cpu_rq(i);
wl = weighted_cpuload(i);
/*
@@ -6142,7 +5164,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -6156,7 +5177,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
- .fbq_type = all,
};
/*
@@ -6248,17 +5268,17 @@ more_balance:
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
- if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
-
- /* Prevent to re-select dst_cpu via env's cpus */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
- env.flags &= ~LBF_DST_PINNED;
+ env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
@@ -6266,18 +5286,6 @@ more_balance:
goto more_balance;
}
- /*
- * We failed to reach balance because of affinity.
- */
- if (sd_parent) {
- int *group_imbalance = &sd_parent->groups->sgp->imbalance;
-
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
- *group_imbalance = 1;
- } else if (*group_imbalance)
- *group_imbalance = 0;
- }
-
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -6385,7 +5393,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- u64 curr_cost = 0;
this_rq->idle_stamp = rq_clock(this_rq);
@@ -6402,27 +5409,15 @@ void idle_balance(int this_cpu, struct rq *this_rq)
for_each_domain(this_cpu, sd) {
unsigned long interval;
int continue_balancing = 1;
- u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
- break;
-
if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
}
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6444,9 +5439,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
*/
this_rq->next_balance = next_balance;
}
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
}
/*
@@ -6580,16 +5572,16 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
- int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference_check_sched_domain(this_rq()->sd);
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ for (; sd; sd = sd->parent)
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6597,16 +5589,16 @@ unlock:
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
- int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference_check_sched_domain(this_rq()->sd);
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ for (; sd; sd = sd->parent)
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6670,39 +5662,15 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
- u64 max_cost = 0;
+ int need_serialize;
update_blocked_averages(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {
- /*
- * Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
- */
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
- max_cost += sd->max_newidle_lb_cost;
-
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- /*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
- */
- if (!continue_balancing) {
- if (need_decay)
- continue;
- break;
- }
-
interval = sd->balance_interval;
if (idle != CPU_IDLE)
interval *= sd->busy_factor;
@@ -6721,7 +5689,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
- * The LBF_DST_PINNED logic could have changed
+ * The LBF_SOME_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
@@ -6736,14 +5704,14 @@ out:
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
- }
- if (need_decay) {
+
/*
- * Ensure the rq-wide value also decays but keep it at a
- * reasonable floor to avoid funnies with rq->avg_idle.
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
*/
- rq->max_idle_balance_cost =
- max((u64)sysctl_sched_migration_cost, max_cost);
+ if (!continue_balancing)
+ break;
}
rcu_read_unlock();
@@ -6813,8 +5781,6 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
struct sched_domain *sd;
- struct sched_group_power *sgp;
- int nr_busy;
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6840,22 +5806,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
- if (sd) {
- sgp = sd->groups->sgp;
- nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group_power *sgp = sg->sgp;
+ int nr_busy = atomic_read(&sgp->nr_busy_cpus);
- if (nr_busy > 1)
+ if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
goto need_kick_unlock;
- }
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
-
- if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
+ if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+ && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+ if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+ break;
+ }
rcu_read_unlock();
return 0;
@@ -7248,8 +6214,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- /* guarantee group entities always have weight */
- update_load_set(&se->load, NICE_0_LOAD);
+ update_load_set(&se->load, 0);
se->parent = parent;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929..99399f8 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,23 +63,10 @@ SCHED_FEAT(LB_MIN, false)
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
* at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
+ * numa_balancing=. Allow PTE scanning to be forced on UMA machines
+ * for debugging the core machinery.
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, false)
-
-/*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
- */
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
-
-/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
- */
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
+SCHED_FEAT(NUMA_FORCE, false)
#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9..d8da010 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks are never migrated */
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275..01970c8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,10 +246,8 @@ static inline void rt_set_overload(struct rq *rq)
* if we should look at the mask. It would be a shame
* if we looked at the mask, but the mask was not
* updated yet.
- *
- * Matched by the barrier in pull_rt_task().
*/
- smp_wmb();
+ wmb();
atomic_inc(&rq->rd->rto_count);
}
@@ -1171,10 +1169,13 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ int cpu;
+
+ cpu = task_cpu(p);
if (p->nr_cpus_allowed == 1)
goto out;
@@ -1212,7 +1213,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*/
if (curr && unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio) &&
+ (p->nr_cpus_allowed > 1)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1628,12 +1630,6 @@ static int pull_rt_task(struct rq *this_rq)
if (likely(!rt_overloaded(this_rq)))
return 0;
- /*
- * Match the barrier from rt_set_overloaded; this guarantees that if we
- * see overloaded we must also see the rto_mask bit.
- */
- smp_rmb();
-
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1935,8 +1931,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = sched_rr_timeslice;
/*
- * Requeue to the end of queue if we (and all of our ancestors) are not
- * the only element on the queue
+	 * Requeue to the end of queue if we (and all of our ancestors) are
+	 * not the only element on the queue
*/
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {
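The prev != next test above is a compact occupancy check on a circular doubly linked list: for an entry queued behind a separate list head, prev and next both point at the head exactly when the entry is the only element. A minimal standalone illustration; the struct mirrors the kernel's list_head layout:

#include <stdbool.h>

struct list_node {
	struct list_node *next, *prev;
};

/* True iff @entry is the only element on its (headed, circular)
 * list: both neighbours are then the list head itself. */
static bool only_element(const struct list_node *entry)
{
	return entry->prev == entry->next;
}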
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b2..b3c5653 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,7 +6,6 @@
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
#include <linux/tick.h>
-#include <linux/slab.h>
#include "cpupri.h"
#include "cpuacct.h"
@@ -409,10 +408,6 @@ struct rq {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned int nr_running;
-#ifdef CONFIG_NUMA_BALANCING
- unsigned int nr_numa_running;
- unsigned int nr_preferred_running;
-#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
@@ -481,9 +476,6 @@ struct rq {
u64 age_stamp;
u64 idle_stamp;
u64 avg_idle;
-
- /* This is used to determine avg_idle's max value */
- u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -560,12 +552,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-#ifdef CONFIG_NUMA_BALANCING
-extern void sched_setnuma(struct task_struct *p, int node);
-extern int migrate_task_to(struct task_struct *p, int cpu);
-extern int migrate_swap(struct task_struct *, struct task_struct *);
-#endif /* CONFIG_NUMA_BALANCING */
-
#ifdef CONFIG_SMP
#define rcu_dereference_check_sched_domain(p) \
@@ -607,24 +593,9 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
return hsd;
}
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd) {
- if (sd->flags & flag)
- break;
- }
-
- return sd;
-}
-
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
@@ -634,7 +605,6 @@ struct sched_group_power {
*/
unsigned int power, power_orig;
unsigned long next_update;
- int imbalance; /* XXX unrelated to power but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -749,7 +719,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
task_thread_info(p)->cpu = cpu;
- p->wake_cpu = cpu;
#endif
}
@@ -1005,7 +974,7 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1251,24 +1220,6 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
-static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
-{
- if (l1 > l2)
- swap(l1, l2);
-
- spin_lock(l1);
- spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
-}
-
-static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
-{
- if (l1 > l2)
- swap(l1, l2);
-
- raw_spin_lock(l1);
- raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
-}
-
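The double_lock() helpers removed above encode the classic ABBA-deadlock avoidance rule: when two locks of the same class must be held, always acquire them in one global order (here, ascending address), and tell lockdep via spin_lock_nested() that the second acquisition of the class is deliberate. A hedged usage sketch with a hypothetical structure:

#include <linux/kernel.h>
#include <linux/spinlock.h>

struct bucket {
	spinlock_t lock;
	/* ... payload ... */
};

/* Transfer state between two buckets without risking an ABBA
 * deadlock against a concurrent transfer in the other direction. */
static void move_between(struct bucket *a, struct bucket *b)
{
	spinlock_t *l1 = &a->lock, *l2 = &b->lock;

	if (l1 > l2)	/* one global order: lower address locks first */
		swap(l1, l2);

	spin_lock(l1);
	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);

	/* ... move items from a to b ... */

	spin_unlock(l2);
	spin_unlock(l1);
}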
/*
* double_rq_lock - safely lock two runqueues
*
@@ -1354,8 +1305,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void cfs_bandwidth_usage_inc(void);
-extern void cfs_bandwidth_usage_dec(void);
+extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..c7edee7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeued(struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now = rq_clock(task_rq(t)), delta = 0;
if (unlikely(sched_info_on()))
if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
- rq_sched_info_dequeued(rq, delta);
+ rq_sched_info_dequeued(task_rq(t), delta);
}
/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(struct rq *rq, struct task_struct *t)
+static void sched_info_arrive(struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now = rq_clock(task_rq(t)), delta = 0;
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- rq_sched_info_arrive(rq, delta);
+ rq_sched_info_arrive(task_rq(t), delta);
}
/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_queued(struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
+ t->sched_info.last_queued = rq_clock(task_rq(t));
}
/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* sched_info_queued() to mark that it has now again started waiting on
* the runqueue.
*/
-static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
+static inline void sched_info_depart(struct task_struct *t)
{
- unsigned long long delta = rq_clock(rq) -
+ unsigned long long delta = rq_clock(task_rq(t)) -
t->sched_info.last_arrival;
- rq_sched_info_depart(rq, delta);
+ rq_sched_info_depart(task_rq(t), delta);
if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ sched_info_queued(t);
}
/*
@@ -128,34 +128,32 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
+ struct rq *rq = task_rq(prev);
+
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
- sched_info_depart(rq, prev);
+ sched_info_depart(prev);
if (next != rq->idle)
- sched_info_arrive(rq, next);
+ sched_info_arrive(next);
}
static inline void
-sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
- __sched_info_switch(rq, prev, next);
+ __sched_info_switch(prev, next);
}
#else
-#define sched_info_queued(rq, t) do { } while (0)
+#define sched_info_queued(t) do { } while (0)
#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(rq, t) do { } while (0)
-#define sched_info_depart(rq, t) do { } while (0)
-#define sched_info_arrive(rq, next) do { } while (0)
-#define sched_info_switch(rq, t, next) do { } while (0)
+#define sched_info_dequeued(t) do { } while (0)
+#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de..e08fbee 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
{
	return task_cpu(p); /* stop tasks never migrate */
}
diff --git a/kernel/locking/semaphore.c b/kernel/semaphore.c
index 6815171..6815171 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/semaphore.c
diff --git a/kernel/signal.c b/kernel/signal.c
index 940b30e..ded28b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
-int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
+int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
{
int err;
diff --git a/kernel/smp.c b/kernel/smp.c
index bd9f940..0564571 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -15,9 +15,9 @@
#include "smpboot.h"
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
enum {
CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_WAIT = 0x02,
};
struct call_function_data {
@@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd)
static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
@@ -139,15 +139,13 @@ static void csd_unlock(struct call_single_data *csd)
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
+static
+void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
unsigned long flags;
int ipi;
- if (wait)
- csd->flags |= CSD_FLAG_WAIT;
-
raw_spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
list_add_tail(&csd->list, &dst->list);
@@ -342,7 +340,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
}
put_cpu();
}
-EXPORT_SYMBOL_GPL(__smp_call_function_single);
/**
* smp_call_function_many(): Run a function on a set of other CPUs.
@@ -462,6 +459,7 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
return 0;
}
EXPORT_SYMBOL(smp_call_function);
+#endif /* USE_GENERIC_SMP_HELPERS */
/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
@@ -526,11 +524,6 @@ void __init setup_nr_cpu_ids(void)
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
-void __weak smp_announce(void)
-{
- printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
-}
-
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
@@ -547,7 +540,7 @@ void __init smp_init(void)
}
/* Any cleanup work */
- smp_announce();
+ printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 11025cc..d7d498d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
* Distribute under GPLv2.
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ * Remote softirq infrastructure is by Jens Axboe.
*/
#include <linux/export.h>
@@ -27,6 +29,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
+#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
@@ -97,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
raw_local_irq_save(flags);
/*
- * The preempt tracer hooks into preempt_count_add and will break
+ * The preempt tracer hooks into add_preempt_count and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
- __preempt_count_add(cnt);
+ preempt_count() += cnt;
/*
* Were softirqs turned off above:
*/
@@ -117,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
- preempt_count_add(cnt);
+ add_preempt_count(cnt);
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -131,11 +134,12 @@ EXPORT_SYMBOL(local_bh_disable);
static void __local_bh_enable(unsigned int cnt)
{
+ WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == cnt)
trace_softirqs_on(_RET_IP_);
- preempt_count_sub(cnt);
+ sub_preempt_count(cnt);
}
/*
@@ -145,7 +149,6 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
@@ -166,17 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip)
* Keep preemption disabled until we are done with
* softirq processing:
*/
- preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
+ sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
- if (unlikely(!in_interrupt() && local_softirq_pending())) {
- /*
- * Run softirq if any pending. And do it in its own stack
- * as we may be calling this deep in a task call stack already.
- */
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
- }
- preempt_count_dec();
+ dec_preempt_count();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_enable();
#endif
@@ -258,7 +256,7 @@ restart:
" exited with %08x?\n", vec_nr,
softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
- preempt_count_set(prev_count);
+ preempt_count() = prev_count;
}
rcu_bh_qs(cpu);
@@ -282,11 +280,10 @@ restart:
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
- WARN_ON_ONCE(in_interrupt());
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-
+#ifndef __ARCH_HAS_DO_SOFTIRQ
asmlinkage void do_softirq(void)
{
@@ -301,11 +298,13 @@ asmlinkage void do_softirq(void)
pending = local_softirq_pending();
if (pending)
- do_softirq_own_stack();
+ __do_softirq();
local_irq_restore(flags);
}
+#endif
+
/*
* Enter an interrupt context.
*/
@@ -330,21 +329,15 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
if (!force_irqthreads) {
-#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
* it is the irq stack, because it should be near empty
- * at this stage.
+ * at this stage. But we have no way to know if the arch
+ * calls irq_exit() on the irq stack. So call softirq
+ * in its own stack to prevent from any overrun on top
+ * of a potentially deep task stack.
*/
- __do_softirq();
-#else
- /*
- * Otherwise, irq_exit() is called on the task stack that can
- * be potentially deep already. So call softirq in its own stack
- * to prevent from any overrun.
- */
- do_softirq_own_stack();
-#endif
+ do_softirq();
} else {
wakeup_softirqd();
}
@@ -376,7 +369,7 @@ void irq_exit(void)
account_irq_exit_time(current);
trace_hardirq_exit();
- preempt_count_sub(HARDIRQ_OFFSET);
+ sub_preempt_count(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -625,17 +618,146 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
}
EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
+/*
+ * Remote softirq bits
+ */
+
+DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+
+static void __local_trigger(struct call_single_data *cp, int softirq)
+{
+ struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+
+ list_add_tail(&cp->list, head);
+
+ /* Trigger the softirq only if the list was previously empty. */
+ if (head->next == &cp->list)
+ raise_softirq_irqoff(softirq);
+}
+
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+static void remote_softirq_receive(void *data)
+{
+ struct call_single_data *cp = data;
+ unsigned long flags;
+ int softirq;
+
+ softirq = *(int *)cp->info;
+ local_irq_save(flags);
+ __local_trigger(cp, softirq);
+ local_irq_restore(flags);
+}
+
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ if (cpu_online(cpu)) {
+ cp->func = remote_softirq_receive;
+ cp->info = &softirq;
+ cp->flags = 0;
+
+ __smp_call_function_single(cpu, cp, 0);
+ return 0;
+ }
+ return 1;
+}
+#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ return 1;
+}
+#endif
+
+/**
+ * __send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @this_cpu: the currently executing cpu
+ * @softirq: the softirq for the work
+ *
+ * Attempt to schedule softirq work on a remote cpu. If this cannot be
+ * done, the work is instead queued up on the local cpu.
+ *
+ * Interrupts must be disabled.
+ */
+void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+{
+ if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+ __local_trigger(cp, softirq);
+}
+EXPORT_SYMBOL(__send_remote_softirq);
+
+/**
+ * send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @softirq: the softirq for the work
+ *
+ * Like __send_remote_softirq except that disabling interrupts and
+ * computing the current cpu is done for the caller.
+ */
+void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+ unsigned long flags;
+ int this_cpu;
+
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ __send_remote_softirq(cp, cpu, this_cpu, softirq);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(send_remote_softirq);
+
+static int remote_softirq_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+ int i;
+
+ local_irq_disable();
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+ struct list_head *local_head;
+
+ if (list_empty(head))
+ continue;
+
+ local_head = &__get_cpu_var(softirq_work_list[i]);
+ list_splice_init(head, local_head);
+ raise_softirq_irqoff(i);
+ }
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block remote_softirq_cpu_notifier = {
+ .notifier_call = remote_softirq_cpu_notify,
+};
+
void __init softirq_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
+ int i;
+
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
+ for (i = 0; i < NR_SOFTIRQS; i++)
+ INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
}
+ register_hotcpu_notifier(&remote_softirq_cpu_notifier);
+
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
@@ -649,10 +771,6 @@ static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable();
if (local_softirq_pending()) {
- /*
- * We can safely run softirq on inline stack, as we are not deep
- * in the task stack here.
- */
__do_softirq();
rcu_note_context_switch(cpu);
local_irq_enable();
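
The remote-softirq block restored above raises the softirq from __local_trigger() only when the per-cpu list goes from empty to non-empty, so a burst of queued work triggers a single softirq run. A self-contained sketch of that idiom, with a simplified queue standing in for the kernel's list_head and raise_softirq_irqoff():

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int payload; };
struct queue { struct node *head, *tail; };

static void notify(void) { puts("raise: queue went non-empty"); }

static void enqueue(struct queue *q, struct node *n)
{
	int was_empty = (q->head == NULL);

	n->next = NULL;
	if (q->tail)
		q->tail->next = n;
	else
		q->head = n;
	q->tail = n;
	if (was_empty)       /* mirrors: if (head->next == &cp->list) */
		notify();
}

int main(void)
{
	struct queue q = { NULL, NULL };
	struct node a = { NULL, 1 }, b = { NULL, 2 };

	enqueue(&q, &a);     /* prints: queue went non-empty */
	enqueue(&q, &b);     /* silent: work already pending */
	return 0;
}
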
diff --git a/kernel/locking/spinlock.c b/kernel/spinlock.c
index 4b082b5..4b082b5 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/spinlock.c
diff --git a/kernel/rcu/srcu.c b/kernel/srcu.c
index 01d5ccb..01d5ccb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/srcu.c
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e0..c09f295 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
-#include <linux/lglock.h>
/*
* Structure to determine completion condition and record errors. May
@@ -44,14 +43,6 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
-
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -124,184 +115,6 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
return done.executed ? done.ret : -ENOENT;
}
-/* This controls the threads on each CPU. */
-enum multi_stop_state {
- /* Dummy starting state for thread. */
- MULTI_STOP_NONE,
- /* Awaiting everyone to be scheduled. */
- MULTI_STOP_PREPARE,
- /* Disable interrupts. */
- MULTI_STOP_DISABLE_IRQ,
- /* Run the function */
- MULTI_STOP_RUN,
- /* Exit */
- MULTI_STOP_EXIT,
-};
-
-struct multi_stop_data {
- int (*fn)(void *);
- void *data;
- /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
- unsigned int num_threads;
- const struct cpumask *active_cpus;
-
- enum multi_stop_state state;
- atomic_t thread_ack;
-};
-
-static void set_state(struct multi_stop_data *msdata,
- enum multi_stop_state newstate)
-{
- /* Reset ack counter. */
- atomic_set(&msdata->thread_ack, msdata->num_threads);
- smp_wmb();
- msdata->state = newstate;
-}
-
-/* Last one to ack a state moves to the next state. */
-static void ack_state(struct multi_stop_data *msdata)
-{
- if (atomic_dec_and_test(&msdata->thread_ack))
- set_state(msdata, msdata->state + 1);
-}
-
-/* This is the cpu_stop function which stops the CPU. */
-static int multi_cpu_stop(void *data)
-{
- struct multi_stop_data *msdata = data;
- enum multi_stop_state curstate = MULTI_STOP_NONE;
- int cpu = smp_processor_id(), err = 0;
- unsigned long flags;
- bool is_active;
-
- /*
- * When called from stop_machine_from_inactive_cpu(), irq might
- * already be disabled. Save the state and restore it on exit.
- */
- local_save_flags(flags);
-
- if (!msdata->active_cpus)
- is_active = cpu == cpumask_first(cpu_online_mask);
- else
- is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
-
- /* Simple state machine */
- do {
- /* Chill out and ensure we re-read multi_stop_state. */
- cpu_relax();
- if (msdata->state != curstate) {
- curstate = msdata->state;
- switch (curstate) {
- case MULTI_STOP_DISABLE_IRQ:
- local_irq_disable();
- hard_irq_disable();
- break;
- case MULTI_STOP_RUN:
- if (is_active)
- err = msdata->fn(msdata->data);
- break;
- default:
- break;
- }
- ack_state(msdata);
- }
- } while (curstate != MULTI_STOP_EXIT);
-
- local_irq_restore(flags);
- return err;
-}
-
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
-/**
- * stop_two_cpus - stops two cpus
- * @cpu1: the cpu to stop
- * @cpu2: the other cpu to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Stops both the current and specified CPU and runs @fn on one of them.
- *
- * returns when both are completed.
- */
-int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
-{
- struct cpu_stop_done done;
- struct cpu_stop_work work1, work2;
- struct irq_cpu_stop_queue_work_info call_args;
- struct multi_stop_data msdata;
-
- preempt_disable();
- msdata = (struct multi_stop_data){
- .fn = fn,
- .data = arg,
- .num_threads = 2,
- .active_cpus = cpumask_of(cpu1),
- };
-
- work1 = work2 = (struct cpu_stop_work){
- .fn = multi_cpu_stop,
- .arg = &msdata,
- .done = &done
- };
-
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
- cpu_stop_init_done(&done, 2);
- set_state(&msdata, MULTI_STOP_PREPARE);
-
- /*
- * If we observe both CPUs active we know _cpu_down() cannot yet have
- * queued its stop_machine works and therefore ours will get executed
- * first. Or it's not either one of our CPUs that's getting unplugged,
- * in which case we don't care.
- *
- * This relies on the stopper workqueues to be FIFO.
- */
- if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
- preempt_enable();
- return -ENOENT;
- }
-
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 0);
- lg_local_unlock(&stop_cpus_lock);
- preempt_enable();
-
- wait_for_completion(&done.completion);
-
- return done.executed ? done.ret : -ENOENT;
-}
-
/**
* stop_one_cpu_nowait - stop a cpu but don't wait for completion
* @cpu: cpu to stop
@@ -346,10 +159,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- lg_global_lock(&stop_cpus_lock);
+ preempt_disable();
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
- lg_global_unlock(&stop_cpus_lock);
+ preempt_enable();
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -546,14 +359,98 @@ early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
+/* This controls the threads on each CPU. */
+enum stopmachine_state {
+ /* Dummy starting state for thread. */
+ STOPMACHINE_NONE,
+ /* Awaiting everyone to be scheduled. */
+ STOPMACHINE_PREPARE,
+ /* Disable interrupts. */
+ STOPMACHINE_DISABLE_IRQ,
+ /* Run the function */
+ STOPMACHINE_RUN,
+ /* Exit */
+ STOPMACHINE_EXIT,
+};
+
+struct stop_machine_data {
+ int (*fn)(void *);
+ void *data;
+ /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+ unsigned int num_threads;
+ const struct cpumask *active_cpus;
+
+ enum stopmachine_state state;
+ atomic_t thread_ack;
+};
+
+static void set_state(struct stop_machine_data *smdata,
+ enum stopmachine_state newstate)
+{
+ /* Reset ack counter. */
+ atomic_set(&smdata->thread_ack, smdata->num_threads);
+ smp_wmb();
+ smdata->state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(struct stop_machine_data *smdata)
+{
+ if (atomic_dec_and_test(&smdata->thread_ack))
+ set_state(smdata, smdata->state + 1);
+}
+
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
+{
+ struct stop_machine_data *smdata = data;
+ enum stopmachine_state curstate = STOPMACHINE_NONE;
+ int cpu = smp_processor_id(), err = 0;
+ unsigned long flags;
+ bool is_active;
+
+ /*
+ * When called from stop_machine_from_inactive_cpu(), irq might
+ * already be disabled. Save the state and restore it on exit.
+ */
+ local_save_flags(flags);
+
+ if (!smdata->active_cpus)
+ is_active = cpu == cpumask_first(cpu_online_mask);
+ else
+ is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
+
+ /* Simple state machine */
+ do {
+ /* Chill out and ensure we re-read stopmachine_state. */
+ cpu_relax();
+ if (smdata->state != curstate) {
+ curstate = smdata->state;
+ switch (curstate) {
+ case STOPMACHINE_DISABLE_IRQ:
+ local_irq_disable();
+ hard_irq_disable();
+ break;
+ case STOPMACHINE_RUN:
+ if (is_active)
+ err = smdata->fn(smdata->data);
+ break;
+ default:
+ break;
+ }
+ ack_state(smdata);
+ }
+ } while (curstate != STOPMACHINE_EXIT);
+
+ local_irq_restore(flags);
+ return err;
+}
+
int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
- struct multi_stop_data msdata = {
- .fn = fn,
- .data = data,
- .num_threads = num_online_cpus(),
- .active_cpus = cpus,
- };
+ struct stop_machine_data smdata = { .fn = fn, .data = data,
+ .num_threads = num_online_cpus(),
+ .active_cpus = cpus };
if (!stop_machine_initialized) {
/*
@@ -564,7 +461,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
unsigned long flags;
int ret;
- WARN_ON_ONCE(msdata.num_threads != 1);
+ WARN_ON_ONCE(smdata.num_threads != 1);
local_irq_save(flags);
hard_irq_disable();
@@ -575,8 +472,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
}
/* Set the initial state and stop all online cpus. */
- set_state(&msdata, MULTI_STOP_PREPARE);
- return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
+ set_state(&smdata, STOPMACHINE_PREPARE);
+ return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
}
int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -616,25 +513,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
const struct cpumask *cpus)
{
- struct multi_stop_data msdata = { .fn = fn, .data = data,
+ struct stop_machine_data smdata = { .fn = fn, .data = data,
.active_cpus = cpus };
struct cpu_stop_done done;
int ret;
/* Local CPU must be inactive and CPU hotplug in progress. */
BUG_ON(cpu_active(raw_smp_processor_id()));
- msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
+ smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
/* No proper task established and can't sleep - busy wait for lock. */
while (!mutex_trylock(&stop_cpus_mutex))
cpu_relax();
/* Schedule work on other CPUs and execute directly for local CPU */
- set_state(&msdata, MULTI_STOP_PREPARE);
+ set_state(&smdata, STOPMACHINE_PREPARE);
cpu_stop_init_done(&done, num_active_cpus());
- queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
+ queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
&done);
- ret = multi_cpu_stop(&msdata);
+ ret = stop_machine_cpu_stop(&smdata);
/* Busy wait for completion. */
while (!completion_done(&done.completion))
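
The stop_machine state machine reinstated above advances only when every stopper thread has acknowledged the current state: set_state() reloads the ack counter, and the last thread to ack moves everyone forward. A runnable userspace sketch of that protocol — C11 atomics and pthreads stand in for the kernel primitives, and thread 0 plays the active cpu:

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

#define NTHREADS 4
enum { ST_NONE, ST_PREPARE, ST_RUN, ST_EXIT };

static atomic_int state = ST_PREPARE;
static atomic_int thread_ack = NTHREADS;

static void set_state(int s)
{
	atomic_store(&thread_ack, NTHREADS);  /* reset ack counter */
	atomic_store(&state, s);
}

static void ack_state(void)
{
	if (atomic_fetch_sub(&thread_ack, 1) == 1)  /* last one to ack */
		set_state(atomic_load(&state) + 1);
}

static void *stopper(void *arg)
{
	long id = (long)arg;
	int cur = ST_NONE;

	do {
		if (atomic_load(&state) != cur) {
			cur = atomic_load(&state);
			if (cur == ST_RUN && id == 0)
				puts("active thread runs fn()");
			ack_state();
		}
	} while (cur != ST_EXIT);
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	long i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, stopper, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	return 0;
}

State can only advance after all threads have observed and acked it, which is why the final set_state() past ST_EXIT is harmless: by then every loop has already terminated.
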
diff --git a/kernel/sys.c b/kernel/sys.c
index c723113..c18ecca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,7 @@
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/workqueue.h>
#include <linux/capability.h>
#include <linux/device.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a6047..b2f06f3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
-static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -371,6 +371,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "numa_balancing_scan_period_reset",
+ .data = &sysctl_numa_balancing_scan_period_reset,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
@@ -384,20 +391,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "numa_balancing_settle_count",
- .data = &sysctl_numa_balancing_settle_count,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_migrate_deferred",
- .data = &sysctl_numa_balancing_migrate_deferred,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
@@ -969,10 +962,9 @@ static struct ctl_table kern_table[] = {
{
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .proc_handler = proc_doulongvec_minmax,
},
{
.procname = "hung_task_timeout_secs",
@@ -1057,7 +1049,6 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = perf_proc_update_handler,
- .extra1 = &one,
},
{
.procname = "perf_cpu_time_max_percent",
@@ -2223,11 +2214,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
*i = val;
} else {
val = convdiv * (*i) / convmul;
- if (!first) {
+ if (!first)
err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- }
err = proc_put_long(&buffer, &left, val, false);
if (err)
break;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd..b609213 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += scnprintf(str, end - str, "%lu\t", value);
+ str += snprintf(str, end - str, "%lu\t", value);
}
result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += scnprintf(str, end - str, "%lu\t", value);
+ str += snprintf(str, end - str, "%lu\t", value);
}
result = kernel_write(file, buffer, str - buffer, 0);
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
if (get_user(dnaddr, (__le16 __user *)newval))
goto out;
- len = scnprintf(buf, sizeof(buf), "%hu.%hu",
+ len = snprintf(buf, sizeof(buf), "%hu.%hu",
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
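
The three hunks above trade scnprintf() for its older snprintf() form. The distinction matters on truncation: snprintf() returns the length the output would have had, so using that as a buffer offset can walk past the end, while scnprintf() returns what was actually stored. Userspace libc has no scnprintf(); a small reimplementation for experimenting with the difference:

#include <stdio.h>
#include <stdarg.h>

static int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);
	if (i >= (int)size)               /* truncated: clamp to stored */
		i = size ? (int)size - 1 : 0;
	return i;
}

int main(void)
{
	char buf[8];

	printf("snprintf : %d\n", snprintf(buf, sizeof(buf), "%s", "0123456789"));
	printf("scnprintf: %d\n", scnprintf(buf, sizeof(buf), "%s", "0123456789"));
	return 0;  /* prints 10, then 7 */
}
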
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 4aef390..0000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
- __INITRODATA
-
- .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
- .incbin "kernel/x509_certificate_list"
- .globl VMLINUX_SYMBOL(system_certificate_list_end)
-VMLINUX_SYMBOL(system_certificate_list_end):
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 564dd93..0000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const u8 system_certificate_list_end[];
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
- pr_notice("Initialise system trusted keyring\n");
-
- system_trusted_keyring =
- keyring_alloc(".system_keyring",
- KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(system_trusted_keyring))
- panic("Can't allocate system trusted keyring\n");
-
- set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
- return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
- key_ref_t key;
- const u8 *p, *end;
- size_t plen;
-
- pr_notice("Loading compiled-in X.509 certificates\n");
-
- end = system_certificate_list_end;
- p = system_certificate_list;
- while (p < end) {
- /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
- * than 256 bytes in size.
- */
- if (end - p < 4)
- goto dodgy_cert;
- if (p[0] != 0x30 &&
- p[1] != 0x82)
- goto dodgy_cert;
- plen = (p[2] << 8) | p[3];
- plen += 4;
- if (plen > end - p)
- goto dodgy_cert;
-
- key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
- "asymmetric",
- NULL,
- p,
- plen,
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ),
- KEY_ALLOC_NOT_IN_QUOTA |
- KEY_ALLOC_TRUSTED);
- if (IS_ERR(key)) {
- pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
- PTR_ERR(key));
- } else {
- pr_notice("Loaded X.509 cert '%s'\n",
- key_ref_to_ptr(key)->description);
- key_ref_put(key);
- }
- p += plen;
- }
-
- return 0;
-
-dodgy_cert:
- pr_err("Problem parsing in-kernel X.509 certificate list\n");
- return 0;
-}
-late_initcall(load_system_certificate_list);
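
load_system_certificate_list() above splits the concatenated certificate blob by reading each object's DER header: tag 0x30 (SEQUENCE) followed by the 0x82 long-form marker means two big-endian length bytes come next, so no full ASN.1 parser is needed. A self-contained sketch of that framing walk over a synthetic blob; note it uses the strict `||` tag check, whereas the deleted code's `&&` rejects only when both bytes mismatch:

#include <stdio.h>
#include <stddef.h>

/* Total length of the DER object at p, or 0 on bad framing. */
static size_t der_object_len(const unsigned char *p, size_t avail)
{
	size_t plen;

	if (avail < 4)
		return 0;
	if (p[0] != 0x30 || p[1] != 0x82)   /* SEQUENCE, long-form len */
		return 0;
	plen = ((size_t)p[2] << 8 | p[3]) + 4;  /* payload + header */
	return plen <= avail ? plen : 0;
}

int main(void)
{
	/* Two fake back-to-back "certificates" (real ones are >255
	 * bytes, which is what makes the 0x82 form mandatory). */
	unsigned char blob[] = {
		0x30, 0x82, 0x00, 0x01, 0xaa,
		0x30, 0x82, 0x00, 0x02, 0xbb, 0xcc,
	};
	const unsigned char *p = blob, *end = blob + sizeof(blob);

	while (p < end) {
		size_t len = der_object_len(p, end - p);

		if (!len) {
			puts("bad framing");
			return 1;
		}
		printf("object of %zu bytes\n", len);
		p += len;
	}
	return 0;
}
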
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7c..145bb4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,7 +290,6 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
struct listener_list *listeners;
struct listener *s, *tmp, *s2;
unsigned int cpu;
- int ret = 0;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
@@ -305,10 +304,9 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
for_each_cpu(cpu, mask) {
s = kmalloc_node(sizeof(struct listener),
GFP_KERNEL, cpu_to_node(cpu));
- if (!s) {
- ret = -ENOMEM;
+ if (!s)
goto cleanup;
- }
+
s->pid = pid;
s->valid = 1;
@@ -341,7 +339,7 @@ cleanup:
}
up_write(&listeners->sem);
}
- return ret;
+ return 0;
}
static int parse(struct nlattr *na, struct cpumask *mask)
@@ -406,15 +404,11 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
if (!na)
goto err;
- if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
- nla_nest_cancel(skb, na);
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0)
goto err;
- }
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
- if (!ret) {
- nla_nest_cancel(skb, na);
+ if (!ret)
goto err;
- }
nla_nest_end(skb, na);
return nla_data(ret);
@@ -673,18 +667,17 @@ err:
nlmsg_free(rep_skb);
}
-static const struct genl_ops taskstats_ops[] = {
- {
- .cmd = TASKSTATS_CMD_GET,
- .doit = taskstats_user_cmd,
- .policy = taskstats_cmd_get_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = CGROUPSTATS_CMD_GET,
- .doit = cgroupstats_user_cmd,
- .policy = cgroupstats_cmd_get_policy,
- },
+static struct genl_ops taskstats_ops = {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+ .flags = GENL_ADMIN_PERM,
+};
+
+static struct genl_ops cgroupstats_ops = {
+ .cmd = CGROUPSTATS_CMD_GET,
+ .doit = cgroupstats_user_cmd,
+ .policy = cgroupstats_cmd_get_policy,
};
/* Needed early in initialization */
@@ -703,13 +696,26 @@ static int __init taskstats_init(void)
{
int rc;
- rc = genl_register_family_with_ops(&family, taskstats_ops);
+ rc = genl_register_family(&family);
if (rc)
return rc;
+ rc = genl_register_ops(&family, &taskstats_ops);
+ if (rc < 0)
+ goto err;
+
+ rc = genl_register_ops(&family, &cgroupstats_ops);
+ if (rc < 0)
+ goto err_cgroup_ops;
+
family_registered = 1;
pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
return 0;
+err_cgroup_ops:
+ genl_unregister_ops(&family, &taskstats_ops);
+err:
+ genl_unregister_family(&family);
+ return rc;
}
/*
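
The restored taskstats_init() re-introduces split registration with the kernel's usual goto-unwind shape: each step that succeeds gains a label that reverses it, and the labels are taken in reverse order when a later step fails. Distilled to a runnable sketch:

#include <stdio.h>

static int register_a(void)    { puts("A registered");   return 0;  }
static void unregister_a(void) { puts("A unregistered");             }
static int register_b(void)    { puts("B failed");       return -1; }

static int init(void)
{
	int rc;

	rc = register_a();
	if (rc)
		return rc;

	rc = register_b();
	if (rc)
		goto err_a;          /* unwind in reverse order */

	return 0;

err_a:
	unregister_a();
	return rc;
}

int main(void) { return init() ? 1 : 0; }
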
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c..2b62fe8 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
# RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
- depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+ depends on 64BIT
select NO_HZ_COMMON
select RCU_USER_QS
select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65..eec50fc 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
if (!alarmtimer_get_rtcdev())
- return -EINVAL;
+ return -ENOTSUPP;
return hrtimer_get_res(baseid, tp);
}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
- return -EINVAL;
+ return -ENOTSUPP;
*tp = ktime_to_timespec(base->gettime());
return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..662c579 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
const char *buf, size_t count)
{
char name[CS_NAME_LEN];
- ssize_t ret = sysfs_get_uname(buf, name, count);
+ size_t ret = sysfs_get_uname(buf, name, count);
struct clock_event_device *ce;
if (ret < 0)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502..50a8736 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,7 +479,6 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
-void clocksource_mark_unstable(struct clocksource *cs) { }
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
@@ -538,55 +537,40 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
}
/**
- * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
- * @mult: cycle to nanosecond multiplier
- * @shift: cycle to nanosecond divisor (power of two)
- * @maxadj: maximum adjustment value to mult (~11%)
- * @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs: Pointer to clocksource
+ *
*/
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+static u64 clocksource_max_deferment(struct clocksource *cs)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
* cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
+ * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
* which is equivalent to the below.
- * max_cycles < (2^63)/(mult + maxadj)
- * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
- * max_cycles < 2^(63 - log2(mult + maxadj))
- * max_cycles < 1 << (63 - log2(mult + maxadj))
+ * max_cycles < (2^63)/(cs->mult + cs->maxadj)
+ * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
+ * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
+ * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
+ * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
* Please note that we add 1 to the result of the log2 to account for
* any rounding errors, ensure the above inequality is satisfied and
* no overflow will occur.
*/
- max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+ max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
/*
* The actual maximum number of cycles we can defer the clocksource is
- * determined by the minimum of max_cycles and mask.
+ * determined by the minimum of max_cycles and cs->mask.
* Note: Here we subtract the maxadj to make sure we don't sleep for
* too long if there's a large negative adjustment.
*/
- max_cycles = min(max_cycles, mask);
- max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
-
- return max_nsecs;
-}
-
-/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
- *
- */
-static u64 clocksource_max_deferment(struct clocksource *cs)
-{
- u64 max_nsecs;
+ max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+ max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
+ cs->shift);
- max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
- cs->mask);
/*
* To ensure that the clocksource does not wrap whilst we are idle,
* limit the time the clocksource can be deferred by 12.5%. Please
@@ -909,7 +893,7 @@ sysfs_show_current_clocksources(struct device *dev,
return count;
}
-ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
{
size_t ret = cnt;
@@ -940,7 +924,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- ssize_t ret;
+ size_t ret;
mutex_lock(&clocksource_mutex);
@@ -968,7 +952,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
{
struct clocksource *cs;
char name[CS_NAME_LEN];
- ssize_t ret;
+ size_t ret;
ret = sysfs_get_uname(buf, name, count);
if (ret < 0)
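
The comment block above derives the overflow bound max_cycles < 1 << (63 - log2(mult + maxadj)) that keeps cyc2ns inside a 64-bit signed result. Plugging in representative numbers makes the magnitude concrete; the mult/shift pair below models a ~1 GHz counter and is illustrative, not taken from any real clocksource:

#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

static int ilog2_u32(uint32_t v)
{
	int l = -1;

	while (v) { v >>= 1; l++; }
	return l;
}

int main(void)
{
	uint32_t mult = 4194304, shift = 22;   /* 1 cycle ~= 1 ns */
	uint32_t maxadj = mult / 9;            /* ~11% adjustment */
	uint64_t mask = UINT64_MAX;
	uint64_t max_cycles, max_nsecs;

	max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));
	if (max_cycles > mask)
		max_cycles = mask;
	/* subtract maxadj so a large negative adjustment can't oversleep */
	max_nsecs = cyc2ns(max_cycles, mult - maxadj, shift);

	printf("max_cycles = %llu\n", (unsigned long long)max_cycles);
	printf("max_nsecs  = %llu (~%llu s)\n",
	       (unsigned long long)max_nsecs,
	       (unsigned long long)(max_nsecs / 1000000000ULL));
	return 0;   /* ~977 s of deferment for these values */
}
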
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4..bb22151 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,7 +475,6 @@ static void sync_cmos_clock(struct work_struct *work)
* called as close as possible to 500 ms before the new second starts.
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
- * We want the clock to be within a couple of ticks from the target.
*/
if (!ntp_synced()) {
/*
@@ -486,7 +485,7 @@ static void sync_cmos_clock(struct work_struct *work)
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
struct timespec adjust = now;
fail = -ENODEV;
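
sync_cmos_clock() only writes the RTC when its timer fires close to the middle of a second, because the RTC latches on the second boundary and the update must land about 500 ms early. The hunk above narrows the acceptance window from five ticks to half a tick. The window test, reduced to a sketch with an assumed HZ=250 tick:

#include <stdio.h>
#include <stdlib.h>

#define NSEC_PER_SEC 1000000000L

static int in_window(long tv_nsec, long slack)
{
	return labs(tv_nsec - NSEC_PER_SEC / 2) <= slack;
}

int main(void)
{
	long tick_nsec = NSEC_PER_SEC / 250;   /* 4 ms tick */

	printf("499ms, wide:   %d\n", in_window(499000000, tick_nsec * 5));
	printf("499ms, narrow: %d\n", in_window(499000000, tick_nsec / 2));
	printf("520ms, narrow: %d\n", in_window(520000000, tick_nsec / 2));
	return 0;   /* prints 1, 1, 0 */
}
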
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68b7993..0b479a6 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,28 +8,25 @@
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/ktime.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/syscore_ops.h>
-#include <linux/hrtimer.h>
+#include <linux/timer.h>
#include <linux/sched_clock.h>
-#include <linux/seqlock.h>
-#include <linux/bitops.h>
struct clock_data {
- ktime_t wrap_kt;
u64 epoch_ns;
- u64 epoch_cyc;
- seqcount_t seq;
+ u32 epoch_cyc;
+ u32 epoch_cyc_copy;
unsigned long rate;
u32 mult;
u32 shift;
bool suspended;
};
-static struct hrtimer sched_clock_timer;
+static void sched_clock_poll(unsigned long wrap_ticks);
+static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
@@ -38,46 +35,42 @@ static struct clock_data cd = {
.mult = NSEC_PER_SEC / HZ,
};
-static u64 __read_mostly sched_clock_mask;
+static u32 __read_mostly sched_clock_mask = 0xffffffff;
-static u64 notrace jiffy_sched_clock_read(void)
+static u32 notrace jiffy_sched_clock_read(void)
{
- /*
- * We don't need to use get_jiffies_64 on 32-bit arches here
- * because we register with BITS_PER_LONG
- */
- return (u64)(jiffies - INITIAL_JIFFIES);
+ return (u32)(jiffies - INITIAL_JIFFIES);
}
-static u32 __read_mostly (*read_sched_clock_32)(void);
-
-static u64 notrace read_sched_clock_32_wrapper(void)
-{
- return read_sched_clock_32();
-}
-
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
-unsigned long long notrace sched_clock(void)
+static unsigned long long notrace sched_clock_32(void)
{
u64 epoch_ns;
- u64 epoch_cyc;
- u64 cyc;
- unsigned long seq;
+ u32 epoch_cyc;
+ u32 cyc;
if (cd.suspended)
return cd.epoch_ns;
+ /*
+ * Load the epoch_cyc and epoch_ns atomically. We do this by
+ * ensuring that we always write epoch_cyc, epoch_ns and
+ * epoch_cyc_copy in strict order, and read them in strict order.
+ * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
+ * the middle of an update, and we should repeat the load.
+ */
do {
- seq = read_seqcount_begin(&cd.seq);
epoch_cyc = cd.epoch_cyc;
+ smp_rmb();
epoch_ns = cd.epoch_ns;
- } while (read_seqcount_retry(&cd.seq, seq));
+ smp_rmb();
+ } while (epoch_cyc != cd.epoch_cyc_copy);
cyc = read_sched_clock();
cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -90,46 +83,49 @@ unsigned long long notrace sched_clock(void)
static void notrace update_sched_clock(void)
{
unsigned long flags;
- u64 cyc;
+ u32 cyc;
u64 ns;
cyc = read_sched_clock();
ns = cd.epoch_ns +
cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
cd.mult, cd.shift);
-
+ /*
+ * Write epoch_cyc and epoch_ns in a way that the update is
+ * detectable in cyc_to_fixed_sched_clock().
+ */
raw_local_irq_save(flags);
- write_seqcount_begin(&cd.seq);
+ cd.epoch_cyc_copy = cyc;
+ smp_wmb();
cd.epoch_ns = ns;
+ smp_wmb();
cd.epoch_cyc = cyc;
- write_seqcount_end(&cd.seq);
raw_local_irq_restore(flags);
}
-static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
+static void sched_clock_poll(unsigned long wrap_ticks)
{
+ mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
update_sched_clock();
- hrtimer_forward_now(hrt, cd.wrap_kt);
- return HRTIMER_RESTART;
}
-void __init sched_clock_register(u64 (*read)(void), int bits,
- unsigned long rate)
+void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
{
- unsigned long r;
+ unsigned long r, w;
u64 res, wrap;
char r_unit;
if (cd.rate > rate)
return;
+ BUG_ON(bits > 32);
WARN_ON(!irqs_disabled());
read_sched_clock = read;
- sched_clock_mask = CLOCKSOURCE_MASK(bits);
+ sched_clock_mask = (1ULL << bits) - 1;
cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */
- clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
+ clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
r = rate;
if (r >= 4000000) {
@@ -142,14 +138,20 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
r_unit = ' ';
/* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
- cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
+ do_div(wrap, NSEC_PER_MSEC);
+ w = wrap;
/* calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, cd.mult, cd.shift);
- pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
- bits, r, r_unit, res, wrap);
+ pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
+ bits, r, r_unit, res, w);
+ /*
+ * Start the timer to keep sched_clock() properly updated and
+ * sets the initial epoch.
+ */
+ sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
update_sched_clock();
/*
@@ -164,10 +166,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
pr_debug("Registered %pF as sched_clock source\n", read);
}
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
+unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
+
+unsigned long long notrace sched_clock(void)
{
- read_sched_clock_32 = read;
- sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
+ return sched_clock_func();
}
void __init sched_clock_postinit(void)
@@ -177,22 +180,14 @@ void __init sched_clock_postinit(void)
* make it the final one.
*/
if (read_sched_clock == jiffy_sched_clock_read)
- sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
+ setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
- update_sched_clock();
-
- /*
- * Start the timer to keep sched_clock() properly updated and
- * sets the initial epoch.
- */
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ sched_clock_poll(sched_clock_timer.data);
}
static int sched_clock_suspend(void)
{
- sched_clock_poll(&sched_clock_timer);
+ sched_clock_poll(sched_clock_timer.data);
cd.suspended = true;
return 0;
}
@@ -200,6 +195,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void)
{
cd.epoch_cyc = read_sched_clock();
+ cd.epoch_cyc_copy = cd.epoch_cyc;
cd.suspended = false;
}
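
The sched_clock revert above swaps the seqcount for the older double-copy protocol: the writer stores epoch_cyc_copy, then epoch_ns, then epoch_cyc with write barriers between them, and a reader that observes epoch_cyc == epoch_cyc_copy is guaranteed a consistent pair. A single-threaded but order-faithful sketch, with C11 fences standing in for smp_wmb()/smp_rmb():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t epoch_cyc, epoch_cyc_copy;
static _Atomic uint64_t epoch_ns;

static void write_epoch(uint32_t cyc, uint64_t ns)
{
	atomic_store_explicit(&epoch_cyc_copy, cyc, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);   /* smp_wmb() */
	atomic_store_explicit(&epoch_ns, ns, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);   /* smp_wmb() */
	atomic_store_explicit(&epoch_cyc, cyc, memory_order_relaxed);
}

static void read_epoch(uint32_t *cyc, uint64_t *ns)
{
	do {
		*cyc = atomic_load_explicit(&epoch_cyc, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
		*ns = atomic_load_explicit(&epoch_ns, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
	} while (*cyc != atomic_load_explicit(&epoch_cyc_copy,
					      memory_order_relaxed));
}

int main(void)
{
	uint32_t cyc;
	uint64_t ns;

	write_epoch(12345, 67890);
	read_epoch(&cyc, &ns);
	printf("cyc=%u ns=%llu\n", cyc, (unsigned long long)ns);
	return 0;
}
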
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..218bcb5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,7 +70,6 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
struct clock_event_device *newdev)
{
if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
- (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
(newdev->features & CLOCK_EVT_FEAT_C3STOP))
return false;
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 162b03a..64522ec 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,21 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
-
-/*
- * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
- * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
- * variable has two functions:
- *
- * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
- * timekeeping lock all at once. Only the CPU which is assigned to do the
- * update is handling it.
- *
- * 2) Hand off the duty in the NOHZ idle case by setting the value to
- * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
- * at it will take over and keep the time keeping alive. The handover
- * procedure also covers cpu hotplug.
- */
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 18e71f7..bc906ca 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
extern void clockevents_shutdown(struct clock_event_device *dev);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/*
* NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ea20f7d..3612fc7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-static int tick_nohz_enabled __read_mostly = 1;
-int tick_nohz_active __read_mostly;
+int tick_nohz_enabled __read_mostly = 1;
+
/*
* Enable / Disable tickless mode
*/
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
- if (!tick_nohz_active)
+ if (!tick_nohz_enabled)
return -1;
now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
- if (!tick_nohz_active)
+ if (!tick_nohz_enabled)
return -1;
now = ktime_get();
@@ -711,10 +711,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
- }
if (need_resched())
return false;
@@ -801,6 +799,11 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = &__get_cpu_var(tick_cpu_sched);
+ /*
+ * Set ts->inidle unconditionally. Even if the system did not
+ * switch to nohz mode, the cpu frequency governors rely on the
+ * update of the idle time accounting in tick_nohz_start_idle().
+ */
ts->inidle = 1;
__tick_nohz_idle_enter(ts);
@@ -970,7 +973,7 @@ static void tick_nohz_switch_to_nohz(void)
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t next;
- if (!tick_nohz_active)
+ if (!tick_nohz_enabled)
return;
local_irq_disable();
@@ -978,7 +981,7 @@ static void tick_nohz_switch_to_nohz(void)
local_irq_enable();
return;
}
- tick_nohz_active = 1;
+
ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
@@ -1136,10 +1139,8 @@ void tick_setup_sched_timer(void)
}
#ifdef CONFIG_NO_HZ_COMMON
- if (tick_nohz_enabled) {
+ if (tick_nohz_enabled)
ts->nohz_mode = NOHZ_MODE_HIGHRES;
- tick_nohz_active = 1;
- }
#endif
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 87b4f00..947ba25 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
tk->xtime_nsec -= remainder;
tk->xtime_nsec += 1ULL << tk->shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
+
}
#else
#define old_vsyscall_fixup(tk)
@@ -1613,10 +1613,9 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
* ktime_get_update_offsets - hrtimer helper
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
- * @offs_tai: pointer to storage for monotonic -> clock tai offset
*
* Returns current monotonic time and updates the offsets
- * Called from hrtimer_interrupt() or retrigger_next_event()
+ * Called from hrtimer_interrupt() or retrigger_next_event()
*/
ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
ktime_t *offs_tai)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f2..0b537f2 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
period = ktime_to_timespec(time);
ms = period.tv_nsec / 1000000;
- seq_puts(m, "Timer Stats Version: v0.3\n");
+ seq_puts(m, "Timer Stats Version: v0.2\n");
seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
- seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
+ seq_printf(m, "Overflow: %d entries\n",
+ atomic_read(&overflow_count));
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
diff --git a/kernel/timer.c b/kernel/timer.c
index accfd24..4296d13 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
- int count = preempt_count();
+ int preempt_count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_release(&lockdep_map);
- if (count != preempt_count()) {
+ if (preempt_count != preempt_count()) {
WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
- fn, count, preempt_count());
+ fn, preempt_count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
* chance to survive and extract information. If the
* callback kept a lock held, bad luck, but not worse
* than the BUG() we had.
*/
- preempt_count_set(count);
+ preempt_count() = preempt_count;
}
}
@@ -1518,8 +1518,9 @@ static int init_timers_cpu(int cpu)
/*
* The APs use this path later in boot
*/
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
+ base = kmalloc_node(sizeof(*base),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
if (!base)
return -ENOMEM;
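
In the restored call_timer_fn() the local variable is literally named preempt_count; that compiles because preempt_count() is a function-like macro in v3.12, so the bare identifier never expands. The underlying idiom — snapshot a balance counter around an untrusted callback and repair it if the callback leaked — in miniature:

#include <stdio.h>

static int lock_depth;                   /* stand-in for preempt_count() */

static void leaky_callback(void) { lock_depth++; /* forgot to unlock */ }

int main(void)
{
	int saved = lock_depth;

	leaky_callback();
	if (lock_depth != saved) {
		fprintf(stderr, "callback leaked: %d -> %d, restoring\n",
			saved, lock_depth);
		lock_depth = saved;      /* survive and keep going */
	}
	return 0;
}
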
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f785aef..b8b8560 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,7 +26,6 @@
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
-#include <linux/list.h>
#include <trace/events/block.h>
@@ -39,9 +38,6 @@ static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
-static LIST_HEAD(running_trace_list);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
-
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -111,18 +107,10 @@ record_it:
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
-static void trace_note_tsk(struct task_struct *tsk)
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
- unsigned long flags;
- struct blk_trace *bt;
-
tsk->btrace_seq = blktrace_seq;
- spin_lock_irqsave(&running_trace_lock, flags);
- list_for_each_entry(bt, &running_trace_list, running_list) {
- trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
- }
- spin_unlock_irqrestore(&running_trace_lock, flags);
+ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
}
static void trace_note_time(struct blk_trace *bt)
@@ -241,15 +229,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
goto record_it;
}
- if (unlikely(tsk->btrace_seq != blktrace_seq))
- trace_note_tsk(tsk);
-
/*
* A word about the locking here - we disable interrupts to reserve
* some space in the relay per-cpu buffer, to prevent an irq
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
+
+ if (unlikely(tsk->btrace_seq != blktrace_seq))
+ trace_note_tsk(bt, tsk);
+
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -488,7 +477,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
- INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -579,12 +567,13 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.end_lba = cbuts.end_lba,
.pid = cbuts.pid,
};
+ memcpy(&buts.name, &cbuts.name, 32);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
- if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+ if (copy_to_user(arg, &buts.name, 32)) {
blk_trace_remove(q);
return -EFAULT;
}
@@ -612,9 +601,6 @@ int blk_trace_startstop(struct request_queue *q, int start)
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
- spin_lock_irq(&running_trace_lock);
- list_add(&bt->running_list, &running_trace_list);
- spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
@@ -622,9 +608,6 @@ int blk_trace_startstop(struct request_queue *q, int start)
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
- spin_lock_irq(&running_trace_lock);
- list_del_init(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
@@ -1489,9 +1472,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
- spin_lock_irq(&running_trace_lock);
- list_del(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
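
The blktrace changes above drop the running-trace list and go back to stamping each task with the global blktrace_seq, so at most one PROCESS note is emitted per task per trace session. The stamp-and-compare idiom, reduced to a runnable sketch:

#include <stdio.h>

static unsigned int blktrace_seq = 1;

struct task { unsigned int btrace_seq; const char *comm; };

static void note(struct task *t)
{
	if (t->btrace_seq != blktrace_seq) {  /* first event this session */
		t->btrace_seq = blktrace_seq;
		printf("note: %s\n", t->comm);
	}
}

int main(void)
{
	struct task t = { 0, "dd" };

	note(&t);          /* emits the note */
	note(&t);          /* suppressed */
	blktrace_seq++;    /* trace restarted */
	note(&t);          /* emits again */
	return 0;
}
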
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0e9f9ea..03cf44a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,6 +367,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
static int __register_ftrace_function(struct ftrace_ops *ops)
{
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
if (FTRACE_WARN_ON(ops == &global_ops))
return -EINVAL;
@@ -425,6 +428,9 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
+ if (ftrace_disabled)
+ return -ENODEV;
+
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
@@ -2082,15 +2088,10 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
- int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- ret = __register_ftrace_function(ops);
- if (ret)
- return ret;
-
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
@@ -2112,17 +2113,12 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return 0;
}
-static int ftrace_shutdown(struct ftrace_ops *ops, int command)
+static void ftrace_shutdown(struct ftrace_ops *ops, int command)
{
bool hash_disable = true;
- int ret;
if (unlikely(ftrace_disabled))
- return -ENODEV;
-
- ret = __unregister_ftrace_function(ops);
- if (ret)
- return ret;
+ return;
ftrace_start_up--;
/*
@@ -2157,10 +2153,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
}
if (!command || !ftrace_enabled)
- return 0;
+ return;
ftrace_run_update_code(command);
- return 0;
}
static void ftrace_startup_sysctl(void)
@@ -3065,13 +3060,16 @@ static void __enable_ftrace_function_probe(void)
if (i == FTRACE_FUNC_HASHSIZE)
return;
- ret = ftrace_startup(&trace_probe_ops, 0);
+ ret = __register_ftrace_function(&trace_probe_ops);
+ if (!ret)
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
static void __disable_ftrace_function_probe(void)
{
+ int ret;
int i;
if (!ftrace_probe_registered)
@@ -3084,7 +3082,9 @@ static void __disable_ftrace_function_probe(void)
}
/* no more funcs left */
- ftrace_shutdown(&trace_probe_ops, 0);
+ ret = __unregister_ftrace_function(&trace_probe_ops);
+ if (!ret)
+ ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
}
@@ -3307,11 +3307,7 @@ void unregister_ftrace_function_probe_all(char *glob)
static LIST_HEAD(ftrace_commands);
static DEFINE_MUTEX(ftrace_cmd_mutex);
-/*
- * Currently we only register ftrace commands from __init, so mark this
- * __init too.
- */
-__init int register_ftrace_command(struct ftrace_func_command *cmd)
+int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
int ret = 0;
@@ -3330,11 +3326,7 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
return ret;
}
-/*
- * Currently we only unregister ftrace commands from __init, so mark
- * this __init too.
- */
-__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
+int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
int ret = -ENODEV;
@@ -3649,7 +3641,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
static int __init set_graph_function(char *str)
{
@@ -3667,7 +3659,7 @@ static void __init set_ftrace_early_graph(char *buf)
func = strsep(&buf, ",");
/* we allow only one expression at a time */
ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- FTRACE_GRAPH_MAX_FUNCS, func);
+ func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3784,25 +3776,15 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
int ftrace_graph_count;
-int ftrace_graph_notrace_count;
+int ftrace_graph_filter_enabled;
unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-
-struct ftrace_graph_data {
- unsigned long *table;
- size_t size;
- int *count;
- const struct seq_operations *seq_ops;
-};
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
- struct ftrace_graph_data *fgd = m->private;
-
- if (*pos >= *fgd->count)
+ if (*pos >= ftrace_graph_count)
return NULL;
- return &fgd->table[*pos];
+ return &ftrace_graph_funcs[*pos];
}
static void *
@@ -3814,12 +3796,10 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
static void *g_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_graph_data *fgd = m->private;
-
mutex_lock(&graph_lock);
/* Nothing, tell g_show to print all functions are enabled */
- if (!*fgd->count && !*pos)
+ if (!ftrace_graph_filter_enabled && !*pos)
return (void *)1;
return __g_next(m, pos);
@@ -3855,88 +3835,38 @@ static const struct seq_operations ftrace_graph_seq_ops = {
};
static int
-__ftrace_graph_open(struct inode *inode, struct file *file,
- struct ftrace_graph_data *fgd)
+ftrace_graph_open(struct inode *inode, struct file *file)
{
int ret = 0;
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
- *fgd->count = 0;
- memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+ ftrace_graph_filter_enabled = 0;
+ ftrace_graph_count = 0;
+ memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
mutex_unlock(&graph_lock);
- if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, fgd->seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = fgd;
- }
- } else
- file->private_data = fgd;
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &ftrace_graph_seq_ops);
return ret;
}
static int
-ftrace_graph_open(struct inode *inode, struct file *file)
-{
- struct ftrace_graph_data *fgd;
-
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
- fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
- if (fgd == NULL)
- return -ENOMEM;
-
- fgd->table = ftrace_graph_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_count;
- fgd->seq_ops = &ftrace_graph_seq_ops;
-
- return __ftrace_graph_open(inode, file, fgd);
-}
-
-static int
-ftrace_graph_notrace_open(struct inode *inode, struct file *file)
-{
- struct ftrace_graph_data *fgd;
-
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
- fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
- if (fgd == NULL)
- return -ENOMEM;
-
- fgd->table = ftrace_graph_notrace_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_notrace_count;
- fgd->seq_ops = &ftrace_graph_seq_ops;
-
- return __ftrace_graph_open(inode, file, fgd);
-}
-
-static int
ftrace_graph_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ) {
- struct seq_file *m = file->private_data;
-
- kfree(m->private);
+ if (file->f_mode & FMODE_READ)
seq_release(inode, file);
- } else {
- kfree(file->private_data);
- }
-
return 0;
}
static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, char *buffer)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -3949,7 +3879,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
/* decode regex */
type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
- if (!not && *idx >= size)
+ if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
return -EBUSY;
search_len = strlen(search);
@@ -3977,7 +3907,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
fail = 0;
if (!exists) {
array[(*idx)++] = rec->ip;
- if (*idx >= size)
+ if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
goto out;
}
} else {
@@ -3995,6 +3925,8 @@ out:
if (fail)
return -EINVAL;
+ ftrace_graph_filter_enabled = !!(*idx);
+
return 0;
}
@@ -4003,33 +3935,36 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
- ssize_t read, ret = 0;
- struct ftrace_graph_data *fgd = file->private_data;
+ ssize_t read, ret;
if (!cnt)
return 0;
- if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
- return -ENOMEM;
+ mutex_lock(&graph_lock);
+
+ if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
read = trace_get_user(&parser, ubuf, cnt, ppos);
if (read >= 0 && trace_parser_loaded((&parser))) {
parser.buffer[parser.idx] = 0;
- mutex_lock(&graph_lock);
-
/* we allow only one expression at a time */
- ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
- parser.buffer);
-
- mutex_unlock(&graph_lock);
+ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+ parser.buffer);
+ if (ret)
+ goto out_free;
}
- if (!ret)
- ret = read;
+ ret = read;
+out_free:
trace_parser_put(&parser);
+out_unlock:
+ mutex_unlock(&graph_lock);
return ret;
}
@@ -4041,14 +3976,6 @@ static const struct file_operations ftrace_graph_fops = {
.llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
};
-
-static const struct file_operations ftrace_graph_notrace_fops = {
- .open = ftrace_graph_notrace_open,
- .read = seq_read,
- .write = ftrace_graph_write,
- .llseek = ftrace_filter_lseek,
- .release = ftrace_graph_release,
-};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
@@ -4070,9 +3997,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
trace_create_file("set_graph_function", 0444, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0444, d_tracer,
- NULL,
- &ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
@@ -4366,15 +4290,12 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) \
- ({ \
- int ___ret = __register_ftrace_function(ops); \
- if (!___ret) \
- (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
- ___ret; \
+# define ftrace_startup(ops, command) \
+ ({ \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ 0; \
})
-# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
-
+# define ftrace_shutdown(ops, command) do { } while (0)
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
@@ -4399,21 +4320,12 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) use RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
do_for_each_ftrace_op(op, ftrace_control_list) {
if (!(op->flags & FTRACE_OPS_FL_STUB) &&
!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip, regs))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
- out:
trace_recursion_clear(TRACE_CONTROL_BIT);
preempt_enable_notrace();
}
@@ -4783,7 +4695,9 @@ int register_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_lock);
- ret = ftrace_startup(ops, 0);
+ ret = __register_ftrace_function(ops);
+ if (!ret)
+ ret = ftrace_startup(ops, 0);
mutex_unlock(&ftrace_lock);
@@ -4802,7 +4716,9 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int ret;
mutex_lock(&ftrace_lock);
- ret = ftrace_shutdown(ops, 0);
+ ret = __unregister_ftrace_function(ops);
+ if (!ret)
+ ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
@@ -4996,13 +4912,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
-/* Just a place holder for function graph */
-static struct ftrace_ops fgraph_ops __read_mostly = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
- FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -5029,7 +4938,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_return = retfunc;
ftrace_graph_entry = entryfunc;
- ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -5046,7 +4955,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_active--;
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
+ ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
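As a stand-alone illustration of the bounded filter table that ftrace_set_func() manages in the restored code above: add an address unless it is already present, and report -EBUSY once FTRACE_GRAPH_MAX_FUNCS entries are in use. Regex parsing and the ftrace record scan are omitted, and the driver in main() is hypothetical, not the kernel implementation.

#include <errno.h>
#include <stdio.h>

#define FTRACE_GRAPH_MAX_FUNCS 32

static unsigned long funcs[FTRACE_GRAPH_MAX_FUNCS];
static int func_count;

static int graph_filter_add(unsigned long ip)
{
	int i;

	if (func_count >= FTRACE_GRAPH_MAX_FUNCS)
		return -EBUSY;		/* table full: *idx >= FTRACE_GRAPH_MAX_FUNCS */

	for (i = 0; i < func_count; i++)
		if (funcs[i] == ip)
			return 0;	/* already present, nothing to add */

	funcs[func_count++] = ip;
	return 0;
}

int main(void)
{
	printf("first add: %d, count %d\n", graph_filter_add(0xc0de), func_count);
	printf("duplicate: %d, count %d\n", graph_filter_add(0xc0de), func_count);
	return 0;
}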
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d20cd9..7974ba2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,33 +235,13 @@ void trace_array_put(struct trace_array *this_tr)
mutex_unlock(&trace_types_lock);
}
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event)
+int filter_current_check_discard(struct ring_buffer *buffer,
+ struct ftrace_event_call *call, void *rec,
+ struct ring_buffer_event *event)
{
- if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(file->filter, rec)) {
- ring_buffer_discard_commit(buffer, event);
- return 1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(filter_check_discard);
-
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
- struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- ring_buffer_discard_commit(buffer, event);
- return 1;
- }
-
- return 0;
+ return filter_check_discard(call, rec, buffer, event);
}
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
@@ -863,12 +843,9 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
if (isspace(ch)) {
parser->buffer[parser->idx] = 0;
parser->cont = false;
- } else if (parser->idx < parser->size - 1) {
+ } else {
parser->cont = true;
parser->buffer[parser->idx++] = ch;
- } else {
- ret = -EINVAL;
- goto out;
}
*ppos += read;
@@ -1284,6 +1261,21 @@ int is_tracing_stopped(void)
}
/**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomaly has
+ * been detected. This will turn off the function tracing,
+ * ring buffers, and other tracing utilities. It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+ tracing_disabled = 1;
+ ftrace_stop();
+ tracing_off_permanent();
+}
+
+/**
* tracing_start - quick start of the tracer
*
* If tracing is enabled but was stopped by tracing_stop,
@@ -1517,8 +1509,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+ (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
@@ -1639,7 +1630,7 @@ trace_function(struct trace_array *tr,
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
}
@@ -1723,7 +1714,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
entry->size = trace.nr_entries;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
out:
@@ -1825,7 +1816,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
trace.entries = entry->caller;
save_stack_trace_user(&trace);
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
out_drop_count:
@@ -2017,7 +2008,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
+ if (!filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
}
@@ -2072,7 +2063,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
memcpy(&entry->buf, tbuffer, len);
entry->buf[len] = '\0';
- if (!call_filter_check_discard(call, entry, buffer, event)) {
+ if (!filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
}
@@ -2769,7 +2760,7 @@ static void show_snapshot_main_help(struct seq_file *m)
seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
seq_printf(m, "# Takes a snapshot of the main buffer.\n");
- seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
+ seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
seq_printf(m, "# is not a '0' or '1')\n");
}
@@ -2973,11 +2964,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
-bool tracing_is_disabled(void)
-{
- return (tracing_disabled) ? true: false;
-}
-
/*
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
@@ -5468,12 +5454,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = {
.func = ftrace_trace_snapshot_callback,
};
-static __init int register_snapshot_cmd(void)
+static int register_snapshot_cmd(void)
{
return register_ftrace_command(&ftrace_snapshot_cmd);
}
#else
-static inline __init int register_snapshot_cmd(void) { return 0; }
+static inline int register_snapshot_cmd(void) { return 0; }
#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
@@ -6267,17 +6253,6 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
iter->trace_buffer = &global_trace.trace_buffer;
-
- if (iter->trace && iter->trace->open)
- iter->trace->open(iter);
-
- /* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->trace_buffer->buffer))
- iter->iter_flags |= TRACE_FILE_ANNOTATE;
-
- /* Output in nanoseconds only if we are using a clock in nanoseconds. */
- if (trace_clocks[iter->tr->clock_id].in_ns)
- iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
}
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
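The restored tracing_generic_entry_update() above packs the context bits back into a single flags byte, with NEED_RESCHED folded into one bit. A minimal sketch of that packing, with hypothetical boolean inputs standing in for the kernel's pc masks and need_resched():

#include <stdio.h>

#define TRACE_FLAG_IRQS_OFF	0x01
#define TRACE_FLAG_NEED_RESCHED	0x04
#define TRACE_FLAG_HARDIRQ	0x08
#define TRACE_FLAG_SOFTIRQ	0x10

static unsigned char pack_trace_flags(int irqs_off, int hardirq, int softirq,
				      int need_resched)
{
	return (irqs_off ? TRACE_FLAG_IRQS_OFF : 0) |
	       (hardirq ? TRACE_FLAG_HARDIRQ : 0) |
	       (softirq ? TRACE_FLAG_SOFTIRQ : 0) |
	       (need_resched ? TRACE_FLAG_NEED_RESCHED : 0);
}

int main(void)
{
	/* hard irq with a pending reschedule: 0x08 | 0x04 */
	printf("flags: 0x%02x\n", pack_trace_flags(0, 1, 0, 1));
	return 0;
}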
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea189e0..10c86fb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,7 +124,6 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024
@@ -193,8 +192,8 @@ struct trace_array {
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
- struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
- struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+ DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
+ DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
#endif
int stop_count;
int clock_id;
@@ -515,7 +514,6 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
-bool tracing_is_disabled(void);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
@@ -713,8 +711,6 @@ extern unsigned long trace_flags;
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
-#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
@@ -734,16 +730,15 @@ extern void __trace_graph_return(struct trace_array *tr,
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
#define FTRACE_GRAPH_MAX_FUNCS 32
+extern int ftrace_graph_filter_enabled;
extern int ftrace_graph_count;
extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
-extern int ftrace_graph_notrace_count;
-extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
static inline int ftrace_graph_addr(unsigned long addr)
{
int i;
- if (!ftrace_graph_count)
+ if (!ftrace_graph_filter_enabled)
return 1;
for (i = 0; i < ftrace_graph_count; i++) {
@@ -763,31 +758,11 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 0;
}
-
-static inline int ftrace_graph_notrace_addr(unsigned long addr)
-{
- int i;
-
- if (!ftrace_graph_notrace_count)
- return 0;
-
- for (i = 0; i < ftrace_graph_notrace_count; i++) {
- if (addr == ftrace_graph_notrace_funcs[i])
- return 1;
- }
-
- return 0;
-}
#else
static inline int ftrace_graph_addr(unsigned long addr)
{
return 1;
}
-
-static inline int ftrace_graph_notrace_addr(unsigned long addr)
-{
- return 0;
-}
#endif /* CONFIG_DYNAMIC_FTRACE */
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
@@ -1011,9 +986,9 @@ struct filter_pred {
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_file *file,
+extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_file *file,
+extern int apply_event_filter(struct ftrace_event_call *call,
char *filter_string);
extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
@@ -1024,6 +999,20 @@ extern int filter_assign_type(const char *type);
struct ftrace_event_field *
trace_find_event_field(struct ftrace_event_call *call, char *name);
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(call->filter, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+
extern void trace_event_enable_cmd_record(bool enable);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
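The restored ftrace_graph_addr() above is a plain linear scan gated by ftrace_graph_filter_enabled: with no filter enabled everything is traced, otherwise only listed addresses match. A small stand-alone sketch of the same logic (the addresses and the main() driver are made up):

#include <stdio.h>

#define FTRACE_GRAPH_MAX_FUNCS 32

static int ftrace_graph_filter_enabled;
static int ftrace_graph_count;
static unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];

static int graph_addr_match(unsigned long addr)
{
	int i;

	if (!ftrace_graph_filter_enabled)
		return 1;		/* no filter: trace everything */

	for (i = 0; i < ftrace_graph_count; i++)
		if (addr == ftrace_graph_funcs[i])
			return 1;

	return 0;
}

int main(void)
{
	printf("unfiltered: %d\n", graph_addr_match(0x1234));
	ftrace_graph_filter_enabled = 1;
	ftrace_graph_funcs[ftrace_graph_count++] = 0x1234;
	printf("listed: %d, unlisted: %d\n",
	       graph_addr_match(0x1234), graph_addr_match(0x5678));
	return 0;
}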
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9b..d594da0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
out:
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f42..80c36bc 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,15 +24,9 @@ static int total_ref_count;
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
- if (tp_event->perf_perm) {
- int ret = tp_event->perf_perm(tp_event, p_event);
- if (ret)
- return ret;
- }
-
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event) &&
- perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* No tracing, just counting, so no obvious leak */
@@ -179,7 +173,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
int perf_trace_init(struct perf_event *p_event)
{
struct ftrace_event_call *tp_event;
- u64 event_id = p_event->attr.config;
+ int event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a11800a..368a4d5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -989,7 +989,7 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
struct trace_seq *s;
int r = -ENODEV;
@@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- print_event_filter(file, s);
+ call = event_file_data(filp);
+ if (call)
+ print_event_filter(call, s);
mutex_unlock(&event_mutex);
- if (file)
+ if (call)
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -1021,7 +1021,7 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
char *buf;
int err = -ENODEV;
@@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
buf[cnt] = '\0';
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ call = event_file_data(filp);
+ if (call)
+ err = apply_event_filter(call, buf);
mutex_unlock(&event_mutex);
free_page((unsigned long) buf);
@@ -1062,9 +1062,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
struct trace_array *tr;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
/* Make sure the system still exists */
mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
@@ -1111,9 +1108,6 @@ static int system_tr_open(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
if (trace_array_get(tr) < 0)
return -ENODEV;
@@ -1130,12 +1124,11 @@ static int system_tr_open(struct inode *inode, struct file *filp)
if (ret < 0) {
trace_array_put(tr);
kfree(dir);
- return ret;
}
filp->private_data = dir;
- return 0;
+ return ret;
}
static int subsystem_release(struct inode *inode, struct file *file)
@@ -1546,7 +1539,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
return -1;
}
}
- trace_create_file("filter", 0644, file->dir, file,
+ trace_create_file("filter", 0644, file->dir, call,
&ftrace_event_filter_fops);
trace_create_file("format", 0444, file->dir, call,
@@ -1584,7 +1577,6 @@ static void event_remove(struct ftrace_event_call *call)
if (file->event_call != call)
continue;
ftrace_event_enable_disable(file, 0);
- destroy_preds(file);
/*
* The do_for_each_event_file() is
* a double loop. After finding the call for this
@@ -1708,7 +1700,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- destroy_call_preds(call);
+ destroy_preds(call);
}
static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -2314,9 +2306,6 @@ int event_trace_del_tracer(struct trace_array *tr)
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
- /* Access to events are within rcu_read_lock_sched() */
- synchronize_sched();
-
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2468f56..97daa8c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,18 +637,10 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-static inline struct event_filter *event_filter(struct ftrace_event_file *file)
-{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- return file->event_call->filter;
- else
- return file->filter;
-}
-
/* caller must hold event_mutex */
-void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter = event_filter(file);
+ struct event_filter *filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
@@ -774,21 +766,11 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void call_filter_disable(struct ftrace_event_call *call)
+static void filter_disable(struct ftrace_event_call *call)
{
call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
-static void filter_disable(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call_filter_disable(call);
- else
- file->flags &= ~FTRACE_EVENT_FL_FILTERED;
-}
-
static void __free_filter(struct event_filter *filter)
{
if (!filter)
@@ -799,30 +781,16 @@ static void __free_filter(struct event_filter *filter)
kfree(filter);
}
-void destroy_call_preds(struct ftrace_event_call *call)
-{
- __free_filter(call->filter);
- call->filter = NULL;
-}
-
-static void destroy_file_preds(struct ftrace_event_file *file)
-{
- __free_filter(file->filter);
- file->filter = NULL;
-}
-
/*
- * Called when destroying the ftrace_event_file.
- * The file is being freed, so we do not need to worry about
- * the file being currently used. This is for module code removing
+ * Called when destroying the ftrace_event_call.
+ * The call is being freed, so we do not need to worry about
+ * the call being currently used. This is for module code removing
* the tracepoints from within it.
*/
-void destroy_preds(struct ftrace_event_file *file)
+void destroy_preds(struct ftrace_event_call *call)
{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- destroy_call_preds(file->event_call);
- else
- destroy_file_preds(file);
+ __free_filter(call->filter);
+ call->filter = NULL;
}
static struct event_filter *__alloc_filter(void)
@@ -857,56 +825,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
return 0;
}
-static inline void __remove_filter(struct ftrace_event_file *file)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
{
- struct ftrace_event_call *call = file->event_call;
-
- filter_disable(file);
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- remove_filter_string(call->filter);
- else
- remove_filter_string(file->filter);
-}
-
-static void filter_free_subsystem_preds(struct event_subsystem *system,
- struct trace_array *tr)
-{
- struct ftrace_event_file *file;
struct ftrace_event_call *call;
- list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
+ list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
- __remove_filter(file);
- }
-}
-
-static inline void __free_subsystem_filter(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
- __free_filter(call->filter);
- call->filter = NULL;
- } else {
- __free_filter(file->filter);
- file->filter = NULL;
+ filter_disable(call);
+ remove_filter_string(call->filter);
}
}
-static void filter_free_subsystem_filters(struct event_subsystem *system,
- struct trace_array *tr)
+static void filter_free_subsystem_filters(struct event_subsystem *system)
{
- struct ftrace_event_file *file;
struct ftrace_event_call *call;
- list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
+ list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
- __free_subsystem_filter(file);
+ __free_filter(call->filter);
+ call->filter = NULL;
}
}
@@ -1677,85 +1617,15 @@ fail:
return err;
}
-static inline void event_set_filtered_flag(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags |= TRACE_EVENT_FL_FILTERED;
- else
- file->flags |= FTRACE_EVENT_FL_FILTERED;
-}
-
-static inline void event_set_filter(struct ftrace_event_file *file,
- struct event_filter *filter)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- rcu_assign_pointer(call->filter, filter);
- else
- rcu_assign_pointer(file->filter, filter);
-}
-
-static inline void event_clear_filter(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- RCU_INIT_POINTER(call->filter, NULL);
- else
- RCU_INIT_POINTER(file->filter, NULL);
-}
-
-static inline void
-event_set_no_set_filter_flag(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
- else
- file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
-}
-
-static inline void
-event_clear_no_set_filter_flag(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
- else
- file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
-}
-
-static inline bool
-event_no_set_filter_flag(struct ftrace_event_file *file)
-{
- struct ftrace_event_call *call = file->event_call;
-
- if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
- return true;
-
- if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
- (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
- return true;
-
- return false;
-}
-
struct filter_list {
struct list_head list;
struct event_filter *filter;
};
static int replace_system_preds(struct event_subsystem *system,
- struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
- struct ftrace_event_file *file;
struct ftrace_event_call *call;
struct filter_list *filter_item;
struct filter_list *tmp;
@@ -1763,8 +1633,8 @@ static int replace_system_preds(struct event_subsystem *system,
bool fail = true;
int err;
- list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
+ list_for_each_entry(call, &ftrace_events, list) {
+
if (strcmp(call->class->system, system->name) != 0)
continue;
@@ -1774,20 +1644,18 @@ static int replace_system_preds(struct event_subsystem *system,
*/
err = replace_preds(call, NULL, ps, filter_string, true);
if (err)
- event_set_no_set_filter_flag(file);
+ call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
else
- event_clear_no_set_filter_flag(file);
+ call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
}
- list_for_each_entry(file, &tr->events, list) {
+ list_for_each_entry(call, &ftrace_events, list) {
struct event_filter *filter;
- call = file->event_call;
-
if (strcmp(call->class->system, system->name) != 0)
continue;
- if (event_no_set_filter_flag(file))
+ if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
continue;
filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
@@ -1808,17 +1676,17 @@ static int replace_system_preds(struct event_subsystem *system,
err = replace_preds(call, filter, ps, filter_string, false);
if (err) {
- filter_disable(file);
+ filter_disable(call);
parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
append_filter_err(ps, filter);
} else
- event_set_filtered_flag(file);
+ call->flags |= TRACE_EVENT_FL_FILTERED;
/*
* Regardless of whether this returned an error, we still
* replace the filter for the call.
*/
- filter = event_filter(file);
- event_set_filter(file, filter_item->filter);
+ filter = call->filter;
+ rcu_assign_pointer(call->filter, filter_item->filter);
filter_item->filter = filter;
fail = false;
@@ -1948,7 +1816,6 @@ static int create_filter(struct ftrace_event_call *call,
* and always remembers @filter_str.
*/
static int create_system_filter(struct event_subsystem *system,
- struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
struct event_filter *filter = NULL;
@@ -1957,7 +1824,7 @@ static int create_system_filter(struct event_subsystem *system,
err = create_filter_start(filter_str, true, &ps, &filter);
if (!err) {
- err = replace_system_preds(system, tr, ps, filter_str);
+ err = replace_system_preds(system, ps, filter_str);
if (!err) {
/* System filters just show a default message */
kfree(filter->filter_string);
@@ -1973,25 +1840,20 @@ static int create_system_filter(struct event_subsystem *system,
}
/* caller must hold event_mutex */
-int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
- struct ftrace_event_call *call = file->event_call;
struct event_filter *filter;
int err;
if (!strcmp(strstrip(filter_string), "0")) {
- filter_disable(file);
- filter = event_filter(file);
-
+ filter_disable(call);
+ filter = call->filter;
if (!filter)
return 0;
-
- event_clear_filter(file);
-
+ RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
-
return 0;
}
@@ -2004,15 +1866,14 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
* string
*/
if (filter) {
- struct event_filter *tmp;
+ struct event_filter *tmp = call->filter;
- tmp = event_filter(file);
if (!err)
- event_set_filtered_flag(file);
+ call->flags |= TRACE_EVENT_FL_FILTERED;
else
- filter_disable(file);
+ filter_disable(call);
- event_set_filter(file, filter);
+ rcu_assign_pointer(call->filter, filter);
if (tmp) {
/* Make sure the call is done with the filter */
@@ -2028,7 +1889,6 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string)
{
struct event_subsystem *system = dir->subsystem;
- struct trace_array *tr = dir->tr;
struct event_filter *filter;
int err = 0;
@@ -2041,18 +1901,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
}
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, tr);
+ filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
synchronize_sched();
- filter_free_subsystem_filters(system, tr);
+ filter_free_subsystem_filters(system);
__free_filter(filter);
goto out_unlock;
}
- err = create_system_filter(system, tr, filter_string, &filter);
+ err = create_system_filter(system, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
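The restored apply_event_filter() follows the classic publish-then-free pattern: install the new filter pointer, let readers drain via synchronize_sched(), then free the old filter. A minimal single-threaded sketch of that ordering (synchronize_sched() is modeled as a stub since no concurrent readers exist here; all names are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct event_filter {
	char *filter_string;
};

static struct event_filter *current_filter;

/* kernel: synchronize_sched(); a no-op here, no concurrent readers exist */
static void wait_for_readers(void)
{
}

static void install_filter(struct event_filter *new_filter)
{
	struct event_filter *old = current_filter;

	current_filter = new_filter;	/* kernel: rcu_assign_pointer() */
	wait_for_readers();		/* ensure no reader still sees old */
	free(old);			/* kernel: __free_filter(old) */
}

int main(void)
{
	struct event_filter *f = calloc(1, sizeof(*f));

	install_filter(f);	/* publish a new filter */
	install_filter(NULL);	/* "0" written: drop the filter entirely */
	printf("filter now %p\n", (void *)current_filter);
	return 0;
}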
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e7..d21a746 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \
.event.type = etype, \
.class = &event_class_ftrace_##call, \
.print_fmt = print, \
- .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0b99120..b5c0924 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -82,9 +82,9 @@ static struct trace_array *graph_array;
* to fill in space into DURATION column.
*/
enum {
- FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
- FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
- FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+ DURATION_FILL_FULL = -1,
+ DURATION_FILL_START = -2,
+ DURATION_FILL_END = -3,
};
static enum print_line_t
@@ -114,37 +114,16 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
return -EBUSY;
}
- /*
- * The curr_ret_stack is an index to ftrace return stack of
- * current task. Its value should be in [0, FTRACE_RETFUNC_
- * DEPTH) when the function graph tracer is used. To support
- * filtering out specific functions, it makes the index
- * negative by subtracting a huge value (FTRACE_NOTRACE_DEPTH)
- * so when it sees a negative index the ftrace will ignore
- * the record. And the index gets recovered when returning
- * from the filtered function by adding the FTRACE_NOTRACE_
- * DEPTH and then it'll continue to record functions normally.
- *
- * The curr_ret_stack is initialized to -1 and gets increased
- * in this function. So it can be less than -1 only if it was
- * filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in debugfs by user.
- */
- if (current->curr_ret_stack < -1)
- return -EBUSY;
-
calltime = trace_clock_local();
index = ++current->curr_ret_stack;
- if (ftrace_graph_notrace_addr(func))
- current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
barrier();
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
current->ret_stack[index].subtime = 0;
current->ret_stack[index].fp = frame_pointer;
- *depth = current->curr_ret_stack;
+ *depth = index;
return 0;
}
@@ -158,17 +137,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
index = current->curr_ret_stack;
- /*
- * A negative index here means that it's just returned from a
- * notrace'd function. Recover the index to get the original
- * return address. See ftrace_push_return_trace().
- *
- * TODO: Need to check whether the stack gets corrupted.
- */
- if (index < 0)
- index += FTRACE_NOTRACE_DEPTH;
-
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ if (unlikely(index < 0)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as well panic, otherwise we have no where to go */
@@ -224,15 +193,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
trace.rettime = trace_clock_local();
barrier();
current->curr_ret_stack--;
- /*
- * The curr_ret_stack can be less than -1 only if it was
- * filtered out and it's about to return from the function.
- * Recover the index and continue to trace normal functions.
- */
- if (current->curr_ret_stack < -1) {
- current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
- return ret;
- }
/*
* The trace should run after decrementing the ret counter
@@ -270,7 +230,7 @@ int __trace_graph_entry(struct trace_array *tr,
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, call, entry, event))
__buffer_unlock_commit(buffer, event);
return 1;
@@ -299,20 +259,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
/* trace it when it is nested in an enabled function or is one itself. */
if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
- ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
+ ftrace_graph_ignore_irqs()) ||
(max_depth && trace->depth >= max_depth))
return 0;
- /*
- * Do not trace a function if it's filtered by set_graph_notrace.
- * Make the index of ret stack negative to indicate that it should
- * ignore further functions. But it needs its own ret stack entry
- * to recover the original index in order to continue tracing after
- * returning from the function.
- */
- if (ftrace_graph_notrace_addr(trace->func))
- return 1;
-
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -385,7 +335,7 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, call, entry, event))
__buffer_unlock_commit(buffer, event);
}
@@ -702,7 +652,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* No overhead */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
+ ret = print_graph_duration(DURATION_FILL_START, s, flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
@@ -714,7 +664,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ ret = print_graph_duration(DURATION_FILL_END, s, flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
@@ -779,14 +729,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
return TRACE_TYPE_HANDLED;
/* No real data, just filling the column with spaces */
- switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
- case FLAGS_FILL_FULL:
+ switch (duration) {
+ case DURATION_FILL_FULL:
ret = trace_seq_puts(s, " | ");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
- case FLAGS_FILL_START:
+ case DURATION_FILL_START:
ret = trace_seq_puts(s, " ");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
- case FLAGS_FILL_END:
+ case DURATION_FILL_END:
ret = trace_seq_puts(s, " |");
return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
}
@@ -902,7 +852,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
@@ -1222,7 +1172,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
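The rewind returns ftrace_push_return_trace()/ftrace_pop_return_trace() to a plain bounded stack: push fails with -EBUSY when the stack is exhausted, pop traps on underflow. A stand-alone sketch under those assumptions (the depth constant and struct layout are illustrative, not the kernel's):

#include <assert.h>
#include <errno.h>
#include <stdio.h>

#define RETFUNC_DEPTH 8		/* kernel: FTRACE_RETFUNC_DEPTH, much larger */

struct ret_entry {
	unsigned long ret;
	unsigned long func;
};

static struct ret_entry ret_stack[RETFUNC_DEPTH];
static int curr_ret_stack = -1;

static int push_return(unsigned long ret, unsigned long func, int *depth)
{
	int index;

	if (curr_ret_stack == RETFUNC_DEPTH - 1)
		return -EBUSY;		/* stack exhausted */

	index = ++curr_ret_stack;
	ret_stack[index].ret = ret;
	ret_stack[index].func = func;
	*depth = index;
	return 0;
}

static unsigned long pop_return(void)
{
	int index = curr_ret_stack;

	assert(index >= 0);		/* kernel: ftrace_graph_stop() + WARN_ON */
	curr_ret_stack--;
	return ret_stack[index].ret;
}

int main(void)
{
	int depth = 0;

	if (!push_return(0xbeef, 0xf00d, &depth))
		printf("depth %d, popped ret %#lx\n", depth, pop_return());
	return 0;
}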
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index dae9541..243f683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
entry->ip = (unsigned long)tp->rp.kp.addr;
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
- if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit_regs(buffer, event,
irq_flags, pc, regs);
}
@@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
- if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit_regs(buffer, event,
irq_flags, pc, regs);
}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b8..b3dcfb2 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, pc);
}
@@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->map = *map;
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, 0, pc);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284..34e7cba 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,23 +618,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
-
- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
- TRACE_FLAG_PREEMPT_RESCHED)) {
- case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
- need_resched = 'N';
- break;
- case TRACE_FLAG_NEED_RESCHED:
- need_resched = 'n';
- break;
- case TRACE_FLAG_PREEMPT_RESCHED:
- need_resched = 'p';
- break;
- default:
- need_resched = '.';
- break;
- }
-
+ need_resched =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :
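After the rewind the latency-format resched column above carries only two states; the richer 'N'/'n'/'p' encoding exists only with TRACE_FLAG_PREEMPT_RESCHED. A trivial sketch of the simplified mapping (the flag value is taken from enum trace_flag_type):

#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED 0x04	/* from enum trace_flag_type */

static char need_resched_char(unsigned char flags)
{
	return (flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
}

int main(void)
{
	printf("%c %c\n", need_resched_char(0x04), need_resched_char(0x00));
	return 0;
}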
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9..4e98e3b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_state = next->state;
entry->next_cpu = task_cpu(next);
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, flags, pc);
}
@@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = wakee->state;
entry->next_cpu = task_cpu(wakee);
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit(buffer, event, flags, pc);
}
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af6736..847f88a 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -43,15 +43,46 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
/* The root directory for all stat files */
static struct dentry *stat_dir;
-static void __reset_stat_session(struct stat_session *session)
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessarily release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct tracer_stat *ts,
+ struct rb_node *node)
{
- struct stat_node *snode, *n;
+ struct stat_node *snode;
+ struct rb_node *parent = rb_parent(node);
+
+ if (node->rb_left)
+ return node->rb_left;
+ else if (node->rb_right)
+ return node->rb_right;
+ else {
+ if (!parent)
+ ;
+ else if (parent->rb_left == node)
+ parent->rb_left = NULL;
+ else
+ parent->rb_right = NULL;
- rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
- if (session->ts->stat_release)
- session->ts->stat_release(snode->stat);
+ snode = container_of(node, struct stat_node, node);
+ if (ts->stat_release)
+ ts->stat_release(snode->stat);
kfree(snode);
+
+ return parent;
}
+}
+
+static void __reset_stat_session(struct stat_session *session)
+{
+ struct rb_node *node = session->stat_root.rb_node;
+
+ while (node)
+ node = release_next(session->ts, node);
session->stat_root = RB_ROOT;
}
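The release_next() comment above describes a destructive post-order walk: descend until a node has no children, free it, detach it from its parent, and resume from the parent. A stand-alone sketch of the same idea on a plain binary tree with parent pointers (the kernel uses its rbtree; everything here is illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *left, *right, *parent;
	int stat;
};

static struct node *release_next(struct node *n)
{
	struct node *parent = n->parent;

	if (n->left)
		return n->left;		/* keep descending */
	if (n->right)
		return n->right;

	/* leaf: unlink from the parent so we never revisit it */
	if (parent) {
		if (parent->left == n)
			parent->left = NULL;
		else
			parent->right = NULL;
	}
	printf("releasing %d\n", n->stat);
	free(n);
	return parent;			/* resume closer to the root */
}

static struct node *mk(int stat, struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));

	n->stat = stat;
	n->parent = parent;
	return n;
}

int main(void)
{
	struct node *root = mk(1, NULL);

	root->left = mk(2, root);
	root->right = mk(3, root);

	while (root)			/* frees 2, 3, then the root */
		root = release_next(root);
	return 0;
}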
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ea90eb5..559329d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -302,7 +302,6 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -315,13 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
-
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
- ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
- if (!ftrace_file)
- return;
-
- if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+ if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -343,7 +336,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
- if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, sys_data->enter_event,
+ entry, event))
trace_current_buffer_unlock_commit(buffer, event,
irq_flags, pc);
}
@@ -351,7 +345,6 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -363,13 +356,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
-
- /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
- ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
- if (!ftrace_file)
- return;
-
- if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))
+ if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -390,7 +377,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, sys_data->exit_event,
+ entry, event))
trace_current_buffer_unlock_commit(buffer, event,
irq_flags, pc);
}
@@ -409,7 +397,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
if (!tr->sys_refcount_enter)
ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
if (!ret) {
- rcu_assign_pointer(tr->enter_syscall_files[num], file);
+ set_bit(num, tr->enabled_enter_syscalls);
tr->sys_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
@@ -427,7 +415,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_enter--;
- rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
+ clear_bit(num, tr->enabled_enter_syscalls);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
@@ -447,7 +435,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
if (!tr->sys_refcount_exit)
ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
- rcu_assign_pointer(tr->exit_syscall_files[num], file);
+ set_bit(num, tr->enabled_exit_syscalls);
tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
@@ -465,7 +453,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
+ clear_bit(num, tr->enabled_exit_syscalls);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
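The rewind replaces the per-syscall ftrace_event_file pointers with plain bitmaps: registration sets a bit and the tracepoint handler bails out early when it is clear. A userspace sketch of that gate (the bit helpers and NR_SYSCALLS here are stand-ins for the kernel's set_bit()/test_bit()):

#include <limits.h>
#include <stdio.h>

#define NR_SYSCALLS 512
#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static unsigned long enabled_enter[NR_SYSCALLS / BITS_PER_LONG + 1];

static void set_bit_ul(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit_ul(int nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static void syscall_enter(int syscall_nr)
{
	if (syscall_nr < 0 || !test_bit_ul(syscall_nr, enabled_enter))
		return;			/* tracing disabled for this call */
	printf("tracing syscall %d\n", syscall_nr);
}

int main(void)
{
	syscall_enter(42);		/* ignored: bit clear */
	set_bit_ul(42, enabled_enter);	/* as in reg_event_syscall_enter() */
	syscall_enter(42);		/* now traced */
	return 0;
}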
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b6dcc42..272261b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -128,7 +128,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
- tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
return tu;
error:
@@ -562,7 +561,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
- if (!call_filter_check_discard(call, entry, buffer, event))
+ if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, 0, 0);
}
diff --git a/kernel/up.c b/kernel/up.c
index 509403e..630d72b 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -22,17 +22,6 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-void __smp_call_function_single(int cpu, struct call_single_data *csd,
- int wait)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- csd->func(csd->info);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(__smp_call_function_single);
-
int on_each_cpu(smp_call_func_t func, void *info, int wait)
{
unsigned long flags;
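On a uniprocessor kernel the helper removed above degenerates to running the callback locally with interrupts masked. A sketch of those semantics with the IRQ masking reduced to stubs (hypothetical stand-alone code, not the kernel implementation):

#include <stdio.h>

struct call_single_data {
	void (*func)(void *info);
	void *info;
};

static void local_irq_save_stub(void)    { /* kernel: local_irq_save() */ }
static void local_irq_restore_stub(void) { /* kernel: local_irq_restore() */ }

static void up_call_function_single(struct call_single_data *csd)
{
	local_irq_save_stub();
	csd->func(csd->info);		/* the "remote" call runs right here */
	local_irq_restore_stub();
}

static void hello(void *info)
{
	printf("ran on the only CPU: %s\n", (const char *)info);
}

int main(void)
{
	struct call_single_data csd = { .func = hello, .info = "hi" };

	up_call_function_single(&csd);
	return 0;
}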
diff --git a/kernel/user.c b/kernel/user.c
index a3a0dbf..5bbb919 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,10 +51,6 @@ struct user_namespace init_user_ns = {
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
-#ifdef CONFIG_KEYS_KERBEROS_CACHE
- .krb_cache_register_sem =
- __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem),
-#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62..13fb113 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,9 +101,6 @@ int create_user_ns(struct cred *new)
set_cred_user_ns(new, ns);
-#ifdef CONFIG_PERSISTENT_KEYRINGS
- init_rwsem(&ns->persistent_keyring_register_sem);
-#endif
return 0;
}
@@ -133,9 +130,6 @@ void free_user_ns(struct user_namespace *ns)
do {
parent = ns->parent;
-#ifdef CONFIG_PERSISTENT_KEYRINGS
- key_put(ns->persistent_keyring_register);
-#endif
proc_free_inum(ns->proc_inum);
kmem_cache_free(user_ns_cachep, ns);
ns = parent;
diff --git a/kernel/sched/wait.c b/kernel/wait.c
index 7d50f79..d550920 100644
--- a/kernel/sched/wait.c
+++ b/kernel/wait.c
@@ -53,109 +53,6 @@ EXPORT_SYMBOL(remove_wait_queue);
/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
-{
- wait_queue_t *curr, *next;
-
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
- break;
- }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
-{
- __wake_up_common(q, mode, nr, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
- __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - ie. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
-{
- unsigned long flags;
- int wake_flags = 1; /* XXX WF_SYNC */
-
- if (unlikely(!q))
- return;
-
- if (unlikely(nr_exclusive != 1))
- wake_flags = 0;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
-/*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
-
-/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
* wake-function that tests for the wait-queue being active
@@ -195,30 +92,6 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
-{
- unsigned long flags;
-
- if (signal_pending_state(state, current))
- return -ERESTARTSYS;
-
- wait->private = current;
- wait->func = autoremove_wake_function;
-
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list)) {
- if (wait->flags & WQ_FLAG_EXCLUSIVE)
- __add_wait_queue_tail(q, wait);
- else
- __add_wait_queue(q, wait);
- }
- set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return 0;
-}
-EXPORT_SYMBOL(prepare_to_wait_event);
-
/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on
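The comment above __wake_up_common() explains the exclusive-wakeup contract: every waiter's callback runs until nr_exclusive successful exclusive wakeups have been delivered, at which point the scan stops. A stand-alone sketch of that loop with the wait list simplified to an array (all names besides WQ_FLAG_EXCLUSIVE are made up):

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
	unsigned int flags;
	int (*func)(struct waiter *w);	/* returns nonzero if woken */
};

static int wake_one(struct waiter *w)
{
	printf("woke %s waiter\n",
	       (w->flags & WQ_FLAG_EXCLUSIVE) ? "exclusive" : "non-exclusive");
	return 1;
}

static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
	for (int i = 0; i < n; i++) {
		unsigned int flags = q[i].flags;

		if (q[i].func(&q[i]) &&
		    (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;		/* exclusive budget exhausted */
	}
}

int main(void)
{
	struct waiter q[] = {
		{ 0, wake_one },
		{ WQ_FLAG_EXCLUSIVE, wake_one },
		{ WQ_FLAG_EXCLUSIVE, wake_one },	/* never reached */
	};

	wake_up_common(q, 3, 1);
	return 0;
}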
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c66912be..987293d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,9 +305,6 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
-/* I: attributes used when instantiating ordered pools on demand */
-static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
-
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -521,21 +518,14 @@ static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
-/**
- * worker_pool_assign_id - allocate ID and assign it to @pool
- * @pool: the pool pointer of interest
- *
- * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
- * successfully, -errno on failure.
- */
+/* allocate ID and assign it to @pool */
static int worker_pool_assign_id(struct worker_pool *pool)
{
int ret;
lockdep_assert_held(&wq_pool_mutex);
- ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
- GFP_KERNEL);
+ ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
if (ret >= 0) {
pool->id = ret;
return 0;
@@ -1330,7 +1320,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
- /* if draining, only works from the same workqueue are allowed */
+ /* if dying, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
@@ -1746,17 +1736,16 @@ static struct worker *create_worker(struct worker_pool *pool)
if (IS_ERR(worker->task))
goto fail;
- set_user_nice(worker->task, pool->attrs->nice);
-
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
-
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
* online CPUs. It'll be re-applied when any of the CPUs come up.
*/
+ set_user_nice(worker->task, pool->attrs->nice);
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ /* prevent userland from meddling with cpumask of workqueue workers */
+ worker->task->flags |= PF_NO_SETAFFINITY;
+
/*
* The caller is responsible for ensuring %POOL_DISASSOCIATED
* remains stable across this function. See the comments above the
@@ -4117,7 +4106,7 @@ out_unlock:
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
- int cpu, ret;
+ int cpu;
if (!(wq->flags & WQ_UNBOUND)) {
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4137,13 +4126,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
mutex_unlock(&wq->mutex);
}
return 0;
- } else if (wq->flags & __WQ_ORDERED) {
- ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
- /* there should only be single pwq for ordering guarantee */
- WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
- wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
- "ordering guarantee broken for workqueue %s\n", wq->name);
- return ret;
} else {
return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
@@ -5027,6 +5009,10 @@ static int __init init_workqueues(void)
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
+ /* make sure we have enough bits for OFFQ pool ID */
+ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
+ WORK_CPU_END * NR_STD_WORKER_POOLS);
+
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5065,23 +5051,13 @@ static int __init init_workqueues(void)
}
}
- /* create default unbound and ordered wq attrs */
+ /* create default unbound wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
-
- /*
- * An ordered wq should have only one pwq as ordering is
- * guaranteed by max_active which is enforced by pwqs.
- * Turn off NUMA so that dfl_pwq is used for all nodes.
- */
- BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
- attrs->nice = std_nice[i];
- attrs->no_numa = true;
- ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);