author    | Scott Wood <scottwood@freescale.com> | 2014-04-07 23:49:35 (GMT)
committer | Scott Wood <scottwood@freescale.com> | 2014-04-07 23:49:35 (GMT)
commit    | 62b8c978ee6b8d135d9e7953221de58000dba986 (patch)
tree      | 683b04b2e627f6710c22c151b23c8cc9a165315e /kernel
parent    | 78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download  | linux-fsl-qoriq-62b8c978ee6b8d135d9e7953221de58000dba986.tar.xz
Rewind v3.13-rc3+ (78fd82238d0e5716) to v3.12
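This is a tree-level rewind: one new commit whose tree matches the v3.12 tag while keeping 78fd82238d0e5716 as its parent, so the branch history is preserved rather than rewritten. A minimal sketch of one way to produce such a commit with stock git; the branch name below is hypothetical, not taken from this repository:

    # Hypothetical reconstruction, not from this repository's history.
    # Stage the delta back to v3.12 without moving HEAD, then commit it.
    git checkout fsl-sdk-branch                         # assumed branch name
    git diff --binary HEAD v3.12 | git apply --index    # revert tree to v3.12
    git commit -m "Rewind v3.13-rc3+ (78fd82238d0e5716) to v3.12"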
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.hz | 2
-rw-r--r-- | kernel/Makefile | 81
-rw-r--r-- | kernel/audit.c | 153
-rw-r--r-- | kernel/audit.h | 3
-rw-r--r-- | kernel/auditfilter.c | 3
-rw-r--r-- | kernel/auditsc.c | 133
-rw-r--r-- | kernel/bounds.c | 6
-rw-r--r-- | kernel/cgroup.c | 290
-rw-r--r-- | kernel/context_tracking.c | 2
-rw-r--r-- | kernel/cpu.c | 49
-rw-r--r-- | kernel/cpu/idle.c | 16
-rw-r--r-- | kernel/cpuset.c | 8
-rw-r--r-- | kernel/debug/debug_core.c | 32
-rw-r--r-- | kernel/debug/debug_core.h | 3
-rw-r--r-- | kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r-- | kernel/debug/kdb/kdb_main.c | 3
-rw-r--r-- | kernel/delayacct.c | 7
-rw-r--r-- | kernel/elfcore.c | 10
-rw-r--r-- | kernel/events/core.c | 180
-rw-r--r-- | kernel/events/internal.h | 35
-rw-r--r-- | kernel/events/ring_buffer.c | 101
-rw-r--r-- | kernel/events/uprobes.c | 223
-rw-r--r-- | kernel/extable.c | 4
-rw-r--r-- | kernel/fork.c | 13
-rw-r--r-- | kernel/futex.c | 2
-rw-r--r-- | kernel/gcov/Kconfig | 30
-rw-r--r-- | kernel/gcov/Makefile | 32
-rw-r--r-- | kernel/gcov/base.c | 32
-rw-r--r-- | kernel/gcov/fs.c | 52
-rw-r--r-- | kernel/gcov/gcc_3_4.c | 115
-rw-r--r-- | kernel/gcov/gcc_4_7.c | 560
-rw-r--r-- | kernel/gcov/gcov.h | 65
-rw-r--r-- | kernel/hung_task.c | 17
-rw-r--r-- | kernel/irq/chip.c | 2
-rw-r--r-- | kernel/irq/irqdomain.c | 13
-rw-r--r-- | kernel/irq/manage.c | 4
-rw-r--r-- | kernel/irq/pm.c | 2
-rw-r--r-- | kernel/irq/settings.h | 7
-rw-r--r-- | kernel/irq/spurious.c | 12
-rw-r--r-- | kernel/jump_label.c | 5
-rw-r--r-- | kernel/kexec.c | 2
-rw-r--r-- | kernel/kprobes.c | 4
-rw-r--r-- | kernel/kthread.c | 73
-rw-r--r-- | kernel/lglock.c (renamed from kernel/locking/lglock.c) | 0
-rw-r--r-- | kernel/lockdep.c (renamed from kernel/locking/lockdep.c) | 8
-rw-r--r-- | kernel/lockdep_internals.h (renamed from kernel/locking/lockdep_internals.h) | 0
-rw-r--r-- | kernel/lockdep_proc.c (renamed from kernel/locking/lockdep_proc.c) | 15
-rw-r--r-- | kernel/lockdep_states.h (renamed from kernel/locking/lockdep_states.h) | 0
-rw-r--r-- | kernel/locking/Makefile | 25
-rw-r--r-- | kernel/locking/percpu-rwsem.c | 165
-rw-r--r-- | kernel/locking/rwsem-spinlock.c | 296
-rw-r--r-- | kernel/locking/rwsem-xadd.c | 293
-rw-r--r-- | kernel/locking/spinlock_debug.c | 302
-rw-r--r-- | kernel/modsign_certificate.S | 12
-rw-r--r-- | kernel/modsign_pubkey.c | 104
-rw-r--r-- | kernel/module-internal.h | 2
-rw-r--r-- | kernel/module.c | 169
-rw-r--r-- | kernel/module_signing.c | 11
-rw-r--r-- | kernel/mutex-debug.c (renamed from kernel/locking/mutex-debug.c) | 0
-rw-r--r-- | kernel/mutex-debug.h (renamed from kernel/locking/mutex-debug.h) | 0
-rw-r--r-- | kernel/mutex.c (renamed from kernel/locking/mutex.c) | 2
-rw-r--r-- | kernel/mutex.h (renamed from kernel/locking/mutex.h) | 0
-rw-r--r-- | kernel/padata.c | 9
-rw-r--r-- | kernel/panic.c | 2
-rw-r--r-- | kernel/pid_namespace.c | 8
-rw-r--r-- | kernel/power/Kconfig | 16
-rw-r--r-- | kernel/power/qos.c | 26
-rw-r--r-- | kernel/power/snapshot.c | 9
-rw-r--r-- | kernel/power/user.c | 21
-rw-r--r-- | kernel/printk/printk.c | 35
-rw-r--r-- | kernel/ptrace.c | 3
-rw-r--r-- | kernel/rcu.h (renamed from kernel/rcu/rcu.h) | 7
-rw-r--r-- | kernel/rcu/Makefile | 6
-rw-r--r-- | kernel/rcupdate.c (renamed from kernel/rcu/update.c) | 10
-rw-r--r-- | kernel/rcutiny.c (renamed from kernel/rcu/tiny.c) | 37
-rw-r--r-- | kernel/rcutiny_plugin.h (renamed from kernel/rcu/tiny_plugin.h) | 0
-rw-r--r-- | kernel/rcutorture.c (renamed from kernel/rcu/torture.c) | 6
-rw-r--r-- | kernel/rcutree.c (renamed from kernel/rcu/tree.c) | 200
-rw-r--r-- | kernel/rcutree.h (renamed from kernel/rcu/tree.h) | 2
-rw-r--r-- | kernel/rcutree_plugin.h (renamed from kernel/rcu/tree_plugin.h) | 88
-rw-r--r-- | kernel/rcutree_trace.c (renamed from kernel/rcu/tree_trace.c) | 2
-rw-r--r-- | kernel/rtmutex-debug.c (renamed from kernel/locking/rtmutex-debug.c) | 0
-rw-r--r-- | kernel/rtmutex-debug.h (renamed from kernel/locking/rtmutex-debug.h) | 0
-rw-r--r-- | kernel/rtmutex-tester.c (renamed from kernel/locking/rtmutex-tester.c) | 0
-rw-r--r-- | kernel/rtmutex.c (renamed from kernel/locking/rtmutex.c) | 0
-rw-r--r-- | kernel/rtmutex.h (renamed from kernel/locking/rtmutex.h) | 0
-rw-r--r-- | kernel/rtmutex_common.h (renamed from kernel/locking/rtmutex_common.h) | 0
-rw-r--r-- | kernel/rwsem.c (renamed from kernel/locking/rwsem.c) | 0
-rw-r--r-- | kernel/sched/Makefile | 1
-rw-r--r-- | kernel/sched/completion.c | 299
-rw-r--r-- | kernel/sched/core.c | 703
-rw-r--r-- | kernel/sched/debug.c | 68
-rw-r--r-- | kernel/sched/fair.c | 1445
-rw-r--r-- | kernel/sched/features.h | 19
-rw-r--r-- | kernel/sched/idle_task.c | 2
-rw-r--r-- | kernel/sched/rt.c | 22
-rw-r--r-- | kernel/sched/sched.h | 54
-rw-r--r-- | kernel/sched/stats.h | 46
-rw-r--r-- | kernel/sched/stop_task.c | 2
-rw-r--r-- | kernel/semaphore.c (renamed from kernel/locking/semaphore.c) | 0
-rw-r--r-- | kernel/signal.c | 2
-rw-r--r-- | kernel/smp.c | 19
-rw-r--r-- | kernel/softirq.c | 184
-rw-r--r-- | kernel/spinlock.c (renamed from kernel/locking/spinlock.c) | 0
-rw-r--r-- | kernel/srcu.c (renamed from kernel/rcu/srcu.c) | 0
-rw-r--r-- | kernel/stop_machine.c | 303
-rw-r--r-- | kernel/sys.c | 1
-rw-r--r-- | kernel/sysctl.c | 34
-rw-r--r-- | kernel/sysctl_binary.c | 6
-rw-r--r-- | kernel/system_certificates.S | 10
-rw-r--r-- | kernel/system_keyring.c | 105
-rw-r--r-- | kernel/taskstats.c | 54
-rw-r--r-- | kernel/time/Kconfig | 2
-rw-r--r-- | kernel/time/alarmtimer.c | 4
-rw-r--r-- | kernel/time/clockevents.c | 2
-rw-r--r-- | kernel/time/clocksource.c | 52
-rw-r--r-- | kernel/time/ntp.c | 3
-rw-r--r-- | kernel/time/sched_clock.c | 114
-rw-r--r-- | kernel/time/tick-broadcast.c | 1
-rw-r--r-- | kernel/time/tick-common.c | 15
-rw-r--r-- | kernel/time/tick-internal.h | 2
-rw-r--r-- | kernel/time/tick-sched.c | 25
-rw-r--r-- | kernel/time/timekeeping.c | 5
-rw-r--r-- | kernel/time/timer_stats.c | 8
-rw-r--r-- | kernel/timer.c | 13
-rw-r--r-- | kernel/trace/blktrace.c | 36
-rw-r--r-- | kernel/trace/ftrace.c | 225
-rw-r--r-- | kernel/trace/trace.c | 85
-rw-r--r-- | kernel/trace/trace.h | 51
-rw-r--r-- | kernel/trace/trace_branch.c | 2
-rw-r--r-- | kernel/trace/trace_event_perf.c | 10
-rw-r--r-- | kernel/trace/trace_events.c | 35
-rw-r--r-- | kernel/trace/trace_events_filter.c | 218
-rw-r--r-- | kernel/trace/trace_export.c | 2
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 82
-rw-r--r-- | kernel/trace/trace_kprobe.c | 4
-rw-r--r-- | kernel/trace/trace_mmiotrace.c | 4
-rw-r--r-- | kernel/trace/trace_output.c | 19
-rw-r--r-- | kernel/trace/trace_sched_switch.c | 4
-rw-r--r-- | kernel/trace/trace_stat.c | 41
-rw-r--r-- | kernel/trace/trace_syscalls.c | 32
-rw-r--r-- | kernel/trace/trace_uprobe.c | 3
-rw-r--r-- | kernel/up.c | 11
-rw-r--r-- | kernel/user.c | 4
-rw-r--r-- | kernel/user_namespace.c | 6
-rw-r--r-- | kernel/wait.c (renamed from kernel/sched/wait.c) | 127
-rw-r--r-- | kernel/workqueue.c | 50
147 files changed, 2542 insertions, 6416 deletions
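The diffstat above can be regenerated from the commit and parent IDs in the header; a short sketch, assuming both objects are present in a local clone:

    # Reproduce the per-file stat for kernel/ between parent and commit.
    git diff --stat 78fd82238d0e5716578c326404184a27ba67fd6e \
        62b8c978ee6b8d135d9e7953221de58000dba986 -- kernel
    # Just the summary line:
    git diff --shortstat 78fd8223 62b8c978 -- kernel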
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a8..94fabd5 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) diff --git a/kernel/Makefile b/kernel/Makefile index bbaf7d5..1ce4755 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,44 +6,56 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + rcupdate.o extable.o params.o posix-timers.o \ + kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ -obj-y += locking/ obj-y += power/ obj-y += printk/ obj-y += cpu/ obj-y += irq/ -obj-y += rcu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -69,6 +81,12 @@ obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_TREE_RCU) += rcutree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o +obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -123,52 +141,19 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. 
+ifeq ($(CONFIG_MODULE_SIG),y) # -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. +# Pull the signing certificate and any extra certificates into the kernel # -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 -X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(info X.509 certificate list changed) -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) +extra_certificates: + $(call cmd,touch) -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ +kernel/modsign_certificate.o: signing_key.x509 extra_certificates -clean-files := x509_certificate_list .x509.list -endif - -ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/audit.c b/kernel/audit.c index 906ae5a0..7b0e23a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,6 +60,7 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif +#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -139,17 +140,6 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); -static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, - .mask = -1, - .features = 0, - .lock = 0,}; - -static char *audit_feature_names[2] = { - "only_unset_loginuid", - "loginuid_immutable", -}; - - /* Serialize requests from userspace. 
*/ DEFINE_MUTEX(audit_cmd_mutex); @@ -594,8 +584,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: - case AUDIT_GET_FEATURE: - case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -625,7 +613,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); - if (!audit_enabled && msg_type != AUDIT_USER_AVC) { + if (!audit_enabled) { *ab = NULL; return rc; } @@ -640,94 +628,6 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return rc; } -int is_audit_feature_set(int i) -{ - return af.features & AUDIT_FEATURE_TO_MASK(i); -} - - -static int audit_get_feature(struct sk_buff *skb) -{ - u32 seq; - - seq = nlmsg_hdr(skb)->nlmsg_seq; - - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); - - return 0; -} - -static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, - u32 old_lock, u32 new_lock, int res) -{ - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); - audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", - audit_feature_names[which], !!old_feature, !!new_feature, - !!old_lock, !!new_lock, res); - audit_log_end(ab); -} - -static int audit_set_feature(struct sk_buff *skb) -{ - struct audit_features *uaf; - int i; - - BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); - uaf = nlmsg_data(nlmsg_hdr(skb)); - - /* if there is ever a version 2 we should handle that here */ - - for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { - u32 feature = AUDIT_FEATURE_TO_MASK(i); - u32 old_feature, new_feature, old_lock, new_lock; - - /* if we are not changing this feature, move along */ - if (!(feature & uaf->mask)) - continue; - - old_feature = af.features & feature; - new_feature = uaf->features & feature; - new_lock = (uaf->lock | af.lock) & feature; - old_lock = af.lock & feature; - - /* are we changing a locked feature? 
*/ - if ((af.lock & feature) && (new_feature != old_feature)) { - audit_log_feature_change(i, old_feature, new_feature, - old_lock, new_lock, 0); - return -EPERM; - } - } - /* nothing invalid, do the changes */ - for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { - u32 feature = AUDIT_FEATURE_TO_MASK(i); - u32 old_feature, new_feature, old_lock, new_lock; - - /* if we are not changing this feature, move along */ - if (!(feature & uaf->mask)) - continue; - - old_feature = af.features & feature; - new_feature = uaf->features & feature; - old_lock = af.lock & feature; - new_lock = (uaf->lock | af.lock) & feature; - - if (new_feature != old_feature) - audit_log_feature_change(i, old_feature, new_feature, - old_lock, new_lock, 1); - - if (new_feature) - af.features |= feature; - else - af.features &= ~feature; - af.lock |= new_lock; - } - - return 0; -} - static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -759,7 +659,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) switch (msg_type) { case AUDIT_GET: - memset(&status_set, 0, sizeof(status_set)); status_set.enabled = audit_enabled; status_set.failure = audit_failure; status_set.pid = audit_pid; @@ -771,7 +670,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) &status_set, sizeof(status_set)); break; case AUDIT_SET: - if (nlmsg_len(nlh) < sizeof(struct audit_status)) + if (nlh->nlmsg_len < sizeof(struct audit_status)) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { @@ -800,16 +699,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit); break; - case AUDIT_GET_FEATURE: - err = audit_get_feature(skb); - if (err) - return err; - break; - case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); - if (err) - return err; - break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: @@ -826,8 +715,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.*s'", - AUDIT_MESSAGE_TEXT_MAX, + audit_log_format(ab, " msg='%.1024s'", (char *)data); else { int size; @@ -930,7 +818,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk = current; spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; + s.enabled = tsk->signal->audit_tty != 0; s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); @@ -944,7 +832,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; @@ -1179,6 +1067,13 @@ static void wait_for_auditd(unsigned long sleep_time) remove_wait_queue(&audit_backlog_wait, &wait); } +/* Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the tsk is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. 
If there is no associated task, tsk + * should be NULL. */ + /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1494,7 +1389,7 @@ void audit_log_session_info(struct audit_buffer *ab) u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); + audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) @@ -1641,26 +1536,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } } - /* log the audit_names record type */ - audit_log_format(ab, " nametype="); - switch(n->type) { - case AUDIT_TYPE_NORMAL: - audit_log_format(ab, "NORMAL"); - break; - case AUDIT_TYPE_PARENT: - audit_log_format(ab, "PARENT"); - break; - case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, "DELETE"); - break; - case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, "CREATE"); - break; - default: - audit_log_format(ab, "UNKNOWN"); - break; - } - audit_log_fcaps(ab, n); audit_log_end(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index b779642..123c9b7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -197,9 +197,6 @@ struct audit_context { int fd; int flags; } mmap; - struct { - int argc; - } execve; }; int fds[2]; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 51f3fd4..f7aee8b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -343,7 +343,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: - case AUDIT_INODE: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -424,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 90594c9..9845cb3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -95,6 +95,13 @@ struct audit_aux_data { /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 +struct audit_aux_data_execve { + struct audit_aux_data d; + int argc; + int envc; + struct mm_struct *mm; +}; + struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -114,6 +121,12 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; +struct audit_aux_data_capset { + struct audit_aux_data d; + pid_t pid; + struct audit_cap_data cap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -553,7 +566,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = audit_comparator(name->ino, f->op, f->val); + result = (name->ino == f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { @@ -930,10 +943,8 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. 
*/ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) { - clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + if (state == AUDIT_DISABLED) return 0; - } if (!(context = audit_alloc_context(state))) { kfree(key); @@ -1138,16 +1149,20 @@ static int audit_log_single_execve_arg(struct audit_context *context, } static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab) + struct audit_buffer **ab, + struct audit_aux_data_execve *axi) { int i, len; size_t len_sent = 0; const char __user *p; char *buf; - p = (const char __user *)current->mm->arg_start; + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + p = (const char __user *)axi->mm->arg_start; - audit_log_format(*ab, "argc=%d", context->execve.argc); + audit_log_format(*ab, "argc=%d", axi->argc); /* * we need some kernel buffer to hold the userspace args. Just @@ -1161,7 +1176,7 @@ static void audit_log_execve_info(struct audit_context *context, return; } - for (i = 0; i < context->execve.argc; i++) { + for (i = 0; i < axi->argc; i++) { len = audit_log_single_execve_arg(context, ab, i, &len_sent, p, buf); if (len <= 0) @@ -1264,9 +1279,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; } - case AUDIT_EXECVE: { - audit_log_execve_info(context, &ab); - break; } } audit_log_end(ab); } @@ -1313,6 +1325,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts switch (aux->type) { + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + audit_log_execve_info(context, &ab, axi); + break; } + case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1947,43 +1964,6 @@ int auditsc_get_stamp(struct audit_context *ctx, /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); -static int audit_set_loginuid_perm(kuid_t loginuid) -{ - /* if we are unset, we don't need privs */ - if (!audit_loginuid_set(current)) - return 0; - /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ - if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) - return -EPERM; - /* it is set, you need permission */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - /* reject if this is not an unset and we don't allow that */ - if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) - return -EPERM; - return 0; -} - -static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, - unsigned int oldsessionid, unsigned int sessionid, - int rc) -{ - struct audit_buffer *ab; - uid_t uid, ologinuid, nloginuid; - - uid = from_kuid(&init_user_ns, task_uid(current)); - ologinuid = from_kuid(&init_user_ns, koldloginuid); - nloginuid = from_kuid(&init_user_ns, kloginuid), - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; - audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " - "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, - nloginuid, oldsessionid, sessionid, !rc); - audit_log_end(ab); -} - /** * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value @@ -1995,26 +1975,37 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - unsigned int oldsessionid, sessionid = (unsigned 
int)-1; - kuid_t oldloginuid; - int rc; - - oldloginuid = audit_get_loginuid(current); - oldsessionid = audit_get_sessionid(current); + struct audit_context *context = task->audit_context; + unsigned int sessionid; - rc = audit_set_loginuid_perm(loginuid); - if (rc) - goto out; +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (audit_loginuid_set(task)) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - /* are we setting or clearing? */ - if (uid_valid(loginuid)) - sessionid = atomic_inc_return(&session_id); + sessionid = atomic_inc_return(&session_id); + if (context && context->in_syscall) { + struct audit_buffer *ab; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old auid=%u new auid=%u" + " old ses=%u new ses=%u", + task->pid, + from_kuid(&init_user_ns, task_uid(task)), + from_kuid(&init_user_ns, task->loginuid), + from_kuid(&init_user_ns, loginuid), + task->sessionid, sessionid); + audit_log_end(ab); + } + } task->sessionid = sessionid; task->loginuid = loginuid; -out: - audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); - return rc; + return 0; } /** @@ -2135,12 +2126,22 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -void __audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { + struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - context->type = AUDIT_EXECVE; - context->execve.argc = bprm->argc; + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + ax->mm = bprm->mm; + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; } diff --git a/kernel/bounds.c b/kernel/bounds.c index 5253204..0c9b862 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,8 +10,6 @@ #include <linux/mmzone.h> #include <linux/kbuild.h> #include <linux/page_cgroup.h> -#include <linux/log2.h> -#include <linux/spinlock_types.h> void foo(void) { @@ -19,9 +17,5 @@ void foo(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); -#ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); -#endif - DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); /* End of constants */ } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b729c2..8bd9cfd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -90,14 +90,6 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); /* - * cgroup destruction makes heavy use of work items and there can be a lot - * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq - * which may lead to deadlock. - */ -static struct workqueue_struct *cgroup_destroy_wq; - -/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -133,6 +125,38 @@ struct cfent { }; /* + * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when + * cgroup_subsys->use_id != 0. + */ +#define CSS_ID_MAX (65535) +struct css_id { + /* + * The css to which this ID points. 
This pointer is set to valid value + * after cgroup is populated. If cgroup is removed, this will be NULL. + * This pointer is expected to be RCU-safe because destroy() + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. + */ + struct cgroup_subsys_state __rcu *css; + /* + * ID of this css. + */ + unsigned short id; + /* + * Depth in hierarchy which this ID belongs to. + */ + unsigned short depth; + /* + * ID is freed by RCU. (and lookup routine is RCU safe.) + */ + struct rcu_head rcu_head; + /* + * Hierarchy of CSS ID belongs to. + */ + unsigned short stack[0]; /* Array of Length (depth+1) */ +}; + +/* * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { @@ -199,7 +223,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -364,6 +387,9 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; +static int cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *css); + /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@ -815,6 +841,8 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +static int alloc_css_id(struct cgroup_subsys_state *child_css); + static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -880,7 +908,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); + schedule_work(&cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -904,6 +932,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1490,7 +1523,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, + .d_delete = cgroup_delete, }; struct inode *inode = @@ -2430,7 +2463,7 @@ static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_file_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2491,8 +2524,6 @@ static int cgroup_file_release(struct inode *inode, struct file *file) ret = cft->release(inode, file); if (css->ss) css_put(css); - if (file->f_op == &cgroup_seqfile_operations) - single_release(inode, file); return ret; } @@ -4209,6 +4240,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) goto err; } } + + /* This cgroup is ready now */ + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + struct css_id *id = rcu_dereference_protected(css->id, true); + + /* + * Update id->css pointer and make this css 
visible from + * CSS ID functions. This pointer will be dereferened + * from RCU-read-side without locks. + */ + if (id) + rcu_assign_pointer(id->css, css); + } + return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@ -4260,7 +4306,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + schedule_work(&css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4277,6 +4323,7 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->cgroup = cgrp; css->ss = ss; css->flags = 0; + css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@ -4408,6 +4455,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; init_css(css, ss, cgrp); + + if (ss->use_id) { + err = alloc_css_id(css); + if (err) + goto err_free_all; + } } /* @@ -4550,7 +4603,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + schedule_work(&css->destroy_work); } /** @@ -4872,6 +4925,12 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ + if (ss->use_id) { + ret = cgroup_init_idr(ss, css); + if (ret) + goto err_unload; + } /* * Now we need to entangle the css into the existing css_sets. unlike @@ -4937,6 +4996,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(cgroup_css(cgroup_dummy_top, ss)); + if (ss->use_id) + idr_destroy(&ss->idr); + /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4963,7 +5025,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. + * the cgrp->subsys pointer to find their state. note that this + * also takes care of freeing the css_id. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@ -5034,6 +5097,8 @@ int __init cgroup_init(void) for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); + if (ss->use_id) + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@ -5074,22 +5139,6 @@ out: return err; } -static int __init cgroup_wq_init(void) -{ - /* - * There isn't much point in executing destruction path in - * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. - * - * We would prefer to do this in cgroup_init() above, but that - * is called before init_workqueues(): so leave this until after. - */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); - return 0; -} -core_initcall(cgroup_wq_init); - /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@ -5469,6 +5518,181 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +/* + * Functons for CSS ID. 
+ */ + +/* to get ID other than 0, this should be called when !cgroup_is_dead() */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_raw(css->id); + + if (cssid) + return cssid->id; + return 0; +} +EXPORT_SYMBOL_GPL(css_id); + +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, the caller must hold rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. + */ + +bool css_is_ancestor(struct cgroup_subsys_state *child, + const struct cgroup_subsys_state *root) +{ + struct css_id *child_id; + struct css_id *root_id; + + child_id = rcu_dereference(child->id); + if (!child_id) + return false; + root_id = rcu_dereference(root->id); + if (!root_id) + return false; + if (child_id->depth < root_id->depth) + return false; + if (child_id->stack[root_id->depth] != root_id->id) + return false; + return true; +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ + struct css_id *id = rcu_dereference_protected(css->id, true); + + /* When this is called before css_id initialization, id can be NULL */ + if (!id) + return; + + BUG_ON(!ss->use_id); + + rcu_assign_pointer(id->css, NULL); + rcu_assign_pointer(css->id, NULL); + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, id->id); + spin_unlock(&ss->id_lock); + kfree_rcu(id, rcu_head); +} +EXPORT_SYMBOL_GPL(free_css_id); + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ + struct css_id *newid; + int ret, size; + + BUG_ON(!ss->use_id); + + size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); + newid = kzalloc(size, GFP_KERNEL); + if (!newid) + return ERR_PTR(-ENOMEM); + + idr_preload(GFP_KERNEL); + spin_lock(&ss->id_lock); + /* Don't use 0. 
allocates an ID of 1-65535 */ + ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); + spin_unlock(&ss->id_lock); + idr_preload_end(); + + /* Returns error when there are no free spaces for new ID.*/ + if (ret < 0) + goto err_out; + + newid->id = ret; + newid->depth = depth; + return newid; +err_out: + kfree(newid); + return ERR_PTR(ret); + +} + +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) +{ + struct css_id *newid; + + spin_lock_init(&ss->id_lock); + idr_init(&ss->idr); + + newid = get_new_cssid(ss, 0); + if (IS_ERR(newid)) + return PTR_ERR(newid); + + newid->stack[0] = newid->id; + RCU_INIT_POINTER(newid->css, rootcss); + RCU_INIT_POINTER(rootcss->id, newid); + return 0; +} + +static int alloc_css_id(struct cgroup_subsys_state *child_css) +{ + struct cgroup_subsys_state *parent_css = css_parent(child_css); + struct css_id *child_id, *parent_id; + int i, depth; + + parent_id = rcu_dereference_protected(parent_css->id, true); + depth = parent_id->depth + 1; + + child_id = get_new_cssid(child_css->ss, depth); + if (IS_ERR(child_id)) + return PTR_ERR(child_id); + + for (i = 0; i < depth; i++) + child_id->stack[i] = parent_id->stack[i]; + child_id->stack[depth] = child_id->id; + /* + * child_id->css pointer will be set after this cgroup is available + * see cgroup_populate_dir() + */ + rcu_assign_pointer(child_css->id, child_id); + + return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct css_id *cssid = NULL; + + BUG_ON(!ss->use_id); + cssid = idr_find(&ss->idr, id); + + if (unlikely(!cssid)) + return NULL; + + return rcu_dereference(cssid->css); +} +EXPORT_SYMBOL_GPL(css_lookup); + /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e5f3917..859c8df 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -asmlinkage void __sched notrace preempt_schedule_context(void) +void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e6..d7f07a2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,28 +306,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } - - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); - smpboot_park_threads(cpu); - /* - * So now all preempt/rcu users must observe !cpu_active(). - */ - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. 
*/ @@ -440,6 +420,11 @@ int cpu_up(unsigned int cpu) { int err = 0; +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; + pg_data_t *pgdat; +#endif + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -450,9 +435,27 @@ int cpu_up(unsigned int cpu) return -EINVAL; } - err = try_online_node(cpu_to_node(cpu)); - if (err) - return err; +#ifdef CONFIG_MEMORY_HOTPLUG + nid = cpu_to_node(cpu); + if (!node_online(nid)) { + err = mem_online_node(nid); + if (err) + return err; + } + + pgdat = NODE_DATA(nid); + if (!pgdat) { + printk(KERN_ERR + "Can't online cpu %d due to NULL pgdat\n", cpu); + return -ENOMEM; + } + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } +#endif cpu_maps_update_begin(); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a..e695c0a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -92,7 +92,8 @@ static void cpu_idle_loop(void) if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { - if (!current_clr_polling_and_test()) { + current_clr_polling(); + if (!need_resched()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); @@ -102,16 +103,9 @@ static void cpu_idle_loop(void) } else { local_irq_enable(); } - __current_set_polling(); + current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. 
- */ - if (tif_need_resched()) - set_preempt_need_resched(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); @@ -135,7 +129,7 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - __current_set_polling(); + current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034..6bf981e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,10 +1033,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); + if (need_loop) write_seqcount_begin(&tsk->mems_allowed_seq); - } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1044,10 +1042,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; - if (need_loop) { + if (need_loop) write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } task_unlock(tsk); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7d2f35e..0506d44 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -575,12 +575,8 @@ return_normal: raw_spin_lock(&dbg_slave_lock); #ifdef CONFIG_SMP - /* If send_ready set, slaves are already waiting */ - if (ks->send_ready) - atomic_set(ks->send_ready, 1); - /* Signal the other CPUs to enter kgdb_wait() */ - else if ((!kgdb_single_step) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -682,11 +678,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); - memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; ks->signo = signo; ks->err_code = ecode; + ks->kgdb_usethreadid = 0; ks->linux_regs = regs; if (kgdb_reenter_check(ks)) @@ -736,30 +732,6 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) -{ -#ifdef CONFIG_SMP - if (!kgdb_io_ready(0) || !send_ready) - return 1; - - if (kgdb_info[cpu].enter_kgdb == 0) { - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - - memset(ks, 0, sizeof(struct kgdb_state)); - ks->cpu = cpu; - ks->ex_vector = trapnr; - ks->signo = SIGTRAP; - ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; - ks->linux_regs = regs; - ks->send_ready = send_ready; - kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); - return 0; - } -#endif - return 1; -} - static void kgdb_console_write(struct console *co, const char *s, unsigned count) { diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 572aa4f..2235967 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -26,7 +26,6 @@ struct kgdb_state { unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; - atomic_t *send_ready; }; /* Exception state values */ @@ -75,13 +74,11 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); -#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI #else /* ! 
CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { return DBG_PASS_EVENT; } -#define KGDB_KDB_REASON_SYSTEM_NMI 0 #endif /* CONFIG_KGDB_KDB */ #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca3..328d18e 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -69,10 +69,7 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; - if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) - reason = KDB_REASON_SYSTEM_NMI; - - else if (in_nmi()) + if (in_nmi()) reason = KDB_REASON_NMI; for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8..00eb8f7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1200,9 +1200,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, instruction_pointer(regs)); kdb_dumpregs(regs); break; - case KDB_REASON_SYSTEM_NMI: - kdb_printf("due to System NonMaskable Interrupt\n"); - break; case KDB_REASON_NMI: kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 54996b7..d473988 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -108,6 +108,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); cputime_to_timespec(utime + stime, &ts); @@ -152,6 +158,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_count += tsk->delays->freepages_count; spin_unlock_irqrestore(&tsk->delays->lock, flags); +done: return 0; } diff --git a/kernel/elfcore.c b/kernel/elfcore.c index e556751..ff915ef 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,19 +1,23 @@ #include <linux/elf.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/binfmts.h> + +#include <asm/elf.h> + Elf_Half __weak elf_core_extra_phdrs(void) { return 0; } -int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) { return 1; } -int __weak elf_core_write_extra_data(struct coredump_params *cprm) +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) { return 1; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 72348dc..953c143 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; -static int perf_sample_allowed_ns __read_mostly = - DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; +static atomic_t perf_sample_allowed_ns __read_mostly = + ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); void update_perf_cpu_limits(void) { @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) tmp *= sysctl_perf_cpu_time_max_percent; do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + 
atomic_set(&perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -228,15 +228,14 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 -static DEFINE_PER_CPU(u64, running_sample_length); +DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; u64 local_samples_len; - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - if (allowed_ns == 0) + if (atomic_read(&perf_sample_allowed_ns) == 0) return; /* decay the counter by 1 average sample */ @@ -252,7 +251,7 @@ void perf_sample_event_took(u64 sample_len_ns) */ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - if (avg_local_sample_len <= allowed_ns) + if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) return; if (max_samples_per_tick <= 1) @@ -263,9 +262,10 @@ void perf_sample_event_took(u64 sample_len_ns) perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " + "perf samples too long (%lld > %d), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, + avg_local_sample_len, + atomic_read(&perf_sample_allowed_ns), sysctl_perf_event_sample_rate); update_perf_cpu_limits(); @@ -899,7 +899,6 @@ static void unclone_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); ctx->parent_ctx = NULL; } - ctx->generation++; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1137,8 +1136,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; - - ctx->generation++; } /* @@ -1204,9 +1201,6 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); - if (sample_type & PERF_SAMPLE_TRANSACTION) - size += sizeof(data->txn); - event->header_size = size; } @@ -1316,8 +1310,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; - - ctx->generation++; } static void perf_group_detach(struct perf_event *event) @@ -2154,38 +2146,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, } /* - * Test whether two contexts are equivalent, i.e. whether they have both been - * cloned from the same version of the same context. - * - * Equivalence is measured using a generation number in the context that is - * incremented on each modification to it; see unclone_ctx(), list_add_event() - * and list_del_event(). + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. 
+ * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { - /* Pinning disables the swap optimization */ - if (ctx1->pin_count || ctx2->pin_count) - return 0; - - /* If ctx1 is the parent of ctx2 */ - if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) - return 1; - - /* If ctx2 is the parent of ctx1 */ - if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) - return 1; - - /* - * If ctx1 and ctx2 have the same parent; we flatten the parent - * hierarchy, see perf_event_init_context(). - */ - if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && - ctx1->parent_gen == ctx2->parent_gen) - return 1; - - /* Unmatched */ - return 0; + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; } static void __perf_event_sync_stat(struct perf_event *event, @@ -2234,6 +2210,9 @@ static void __perf_event_sync_stat(struct perf_event *event, perf_event_update_userpage(next_event); } +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { @@ -2265,7 +2244,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; - struct perf_event_context *parent, *next_parent; + struct perf_event_context *parent; struct perf_cpu_context *cpuctx; int do_switch = 1; @@ -2277,18 +2256,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, return; rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; - if (!next_ctx) - goto unlock; - parent = rcu_dereference(ctx->parent_ctx); - next_parent = rcu_dereference(next_ctx->parent_ctx); - - /* If neither context have a parent context; they cannot be clones. */ - if (!parent && !next_parent) - goto unlock; - - if (next_parent == ctx || next_ctx == parent || next_parent == parent) { + next_ctx = next->perf_event_ctxp[ctxn]; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. 
We lock both @@ -2316,7 +2287,6 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } -unlock: rcu_read_unlock(); if (do_switch) { @@ -4602,9 +4572,6 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); - if (sample_type & PERF_SAMPLE_TRANSACTION) - perf_output_put(handle, data->txn); - if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5133,26 +5100,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; + + memset(tmp, 0, sizeof(tmp)); if (file) { struct inode *inode; dev_t dev; - - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) { - name = "//enomem"; - goto cpy_name; - } /* - * d_path() works from the end of the rb backwards, so we + * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); if (IS_ERR(name)) { - name = "//toolong"; - goto cpy_name; + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -5160,39 +5128,34 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); - goto got_name; + } else { - name = (char *)arch_vma_name(vma); - if (name) - goto cpy_name; + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; + goto got_name; + } - if (vma->vm_start <= vma->vm_mm->start_brk && + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { - name = "[heap]"; - goto cpy_name; - } - if (vma->vm_start <= vma->vm_mm->start_stack && + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - name = "[stack]"; - goto cpy_name; + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } - name = "//anon"; - goto cpy_name; + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; } -cpy_name: - strlcpy(tmp, name, sizeof(tmp)); - name = tmp; got_name: - /* - * Since our buffer works in 8 byte units we need to align our string - * size to a multiple of 8. However, we must guarantee the tail end is - * zero'd out to avoid leaking random bits to userspace. 
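Both sides of the got_name hunk must round the file-name length up to a multiple of sizeof(u64), since perf emits the record in 8-byte units, and must not leak stale bytes in the padding. The restored version gets zeroing for free from kzalloc() and only rounds the size; a standalone demonstration of that rounding, assuming the usual power-of-two ALIGN():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Round x up to a multiple of a; a must be a power of two. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	const char *name = "/usr/bin/perf";
	size_t len = strlen(name) + 1;			/* include the NUL */
	size_t size = ALIGN(len, sizeof(uint64_t));

	/* 14 bytes of string + NUL become a 16-byte record */
	printf("len=%zu padded=%zu\n", len, size);
	return 0;
}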
- */ - size = strlen(name)+1; - while (!IS_ALIGNED(size, sizeof(u64))) - name[size++] = '\0'; + size = ALIGN(strlen(name)+1, sizeof(u64)); mmap_event->file_name = name; mmap_event->file_size = size; @@ -5680,6 +5643,11 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5713,6 +5681,9 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); @@ -6321,7 +6292,6 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } -static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, @@ -6366,19 +6336,17 @@ perf_event_mux_interval_ms_store(struct device *dev, return count; } -static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static struct attribute *pmu_dev_attrs[] = { - &dev_attr_type.attr, - &dev_attr_perf_event_mux_interval_ms.attr, - NULL, +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_RW(perf_event_mux_interval_ms), + __ATTR_NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", - .dev_groups = pmu_dev_groups, + .dev_attrs = pmu_dev_attrs, }; static void pmu_dev_release(struct device *dev) @@ -7158,6 +7126,7 @@ SYSCALL_DEFINE5(perf_event_open, } perf_install_in_context(ctx, event, event->cpu); + ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7240,6 +7209,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); + ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b2187..ca65997 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ +static inline unsigned int \ func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ + const void *buf, unsigned int len) \ { \ unsigned long size, written; \ \ do { \ - size = min(handle->size, len); \ + size = min_t(unsigned long, handle->size, len); \ + \ written = memcpy_func(handle->addr, buf, size); \ - written = size - written; \ \ len -= written; \ handle->addr += written; \ @@ -110,37 +110,20 @@ func_name(struct perf_output_handle *handle, \ return len; \ } -static inline unsigned long -memcpy_common(void *dst, const void *src, unsigned long n) +static inline int memcpy_common(void *dst, const void *src, size_t n) { memcpy(dst, src, n); - return 0; + return n; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) -static inline unsigned long -memcpy_skip(void *dst, const void *src, unsigned long n) -{ - return 0; -} +#define MEMCPY_SKIP(dst, src, n) (n) -DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) +DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) #ifndef arch_perf_out_copy_user -#define arch_perf_out_copy_user arch_perf_out_copy_user - -static inline unsigned long -arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) -{ 
- unsigned long ret; - - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, src, n); - pagefault_enable(); - - return ret; -} +#define arch_perf_out_copy_user __copy_from_user_inatomic #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8b168a..9c2ddfb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -12,10 +12,40 @@ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> -#include <linux/circ_buf.h> #include "internal.h" +static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; + + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) + return true; + + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); @@ -85,8 +115,8 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read rb->head. */ if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); @@ -105,7 +135,8 @@ int perf_output_begin(struct perf_output_handle *handle, { struct ring_buffer *rb; unsigned long tail, offset, head; - int have_lost, page_shift; + int have_lost; + struct perf_sample_data sample_data; struct { struct perf_event_header header; u64 id; @@ -120,63 +151,57 @@ int perf_output_begin(struct perf_output_handle *handle, event = event->parent; rb = rcu_dereference(event->rb); - if (unlikely(!rb)) + if (!rb) goto out; - if (unlikely(!rb->nr_pages)) - goto out; + handle->rb = rb; + handle->event = event; - handle->rb = rb; - handle->event = event; + if (!rb->nr_pages) + goto out; have_lost = local_read(&rb->lost); - if (unlikely(have_lost)) { - size += sizeof(lost_event); - if (event->attr.sample_id_all) - size += event->id_header_size; + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; } perf_output_get_handle(handle); do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + * + * See perf_output_put_handle(). + */ tail = ACCESS_ONCE(rb->user_page->data_tail); + smp_mb(); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; head += size; + if (unlikely(!perf_output_space(rb, tail, offset, head))) + goto fail; } while (local_cmpxchg(&rb->head, offset, head) != offset); - /* - * Separate the userpage->tail read from the data stores below. - * Matches the MB userspace SHOULD issue after reading the data - * and before storing the new tail position. - * - * See perf_output_put_handle(). 
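perf_output_space(), reinstated above, decides whether a record of the requested size fits between the reader's tail and the writer's head in a power-of-two ring without overwriting unread data. The same masking logic as a userspace function (ring-buffer fields flattened into plain parameters):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * head/offset/tail are free-running byte counters; sz is a power of two.
 * Returns true if the record written between offset and head fits without
 * overtaking the reader at tail.
 */
static bool output_space(uint64_t sz, uint64_t tail, uint64_t offset,
			 uint64_t head, bool overwrite)
{
	uint64_t mask = sz - 1;

	if (overwrite)			/* overwrite mode may eat its own tail */
		return true;

	if (head - offset > sz)		/* payload bigger than the whole ring */
		return false;

	offset = (offset - tail) & mask;
	head = (head - tail) & mask;

	/* negative means the write would wrap past the reader */
	return (int64_t)(head - offset) >= 0;
}

int main(void)
{
	/*
	 * 4 KiB ring, reader at 0: a 64-byte record starting at offset 4090
	 * wraps past the reader and must be rejected.
	 */
	printf("%d\n", output_space(4096, 0, 4090, 4090 + 64, false));
	return 0;
}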
- */ - smp_mb(); - - if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) + if (head - local_read(&rb->wakeup) > rb->watermark) local_add(rb->watermark, &rb->wakeup); - page_shift = PAGE_SHIFT + page_order(rb); + handle->page = offset >> (PAGE_SHIFT + page_order(rb)); + handle->page &= rb->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); + handle->addr = rb->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; - handle->page = (offset >> page_shift) & (rb->nr_pages - 1); - offset &= (1UL << page_shift) - 1; - handle->addr = rb->data_pages[handle->page] + offset; - handle->size = (1UL << page_shift) - offset; - - if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - - lost_event.header.size = sizeof(lost_event); + if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, &sample_data); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 24b7d6c..ad8e1bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,7 +35,6 @@ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> -#include <linux/task_work.h> #include <linux/uprobes.h> @@ -245,12 +244,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and - * uprobe_write_opcode accordingly. This would never be a problem for archs - * that have fixed length instructions. + * write_opcode accordingly. This would never be a problem for archs that + * have fixed length instructions. */ /* - * uprobe_write_opcode - write the opcode at a given virtual address. + * write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -261,7 +260,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. 
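Earlier in the perf_output_begin() hunk, both variants translate the free-running byte offset into a (data page, offset within page) pair for a ring of 2^n pages. The arithmetic is easier to see with concrete numbers; a sketch with made-up sizes:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12			/* 4 KiB pages */

int main(void)
{
	unsigned int page_order = 0;	/* each data "page" is one real page */
	unsigned int nr_pages = 8;	/* must be a power of two */
	uint64_t offset = 3 * 4096 + 100;	/* linear position in the ring */

	unsigned int page_shift = PAGE_SHIFT + page_order;
	unsigned int page = (offset >> page_shift) & (nr_pages - 1);
	uint64_t in_page = offset & ((1UL << page_shift) - 1);

	printf("page=%u offset=%llu\n", page,
	       (unsigned long long)in_page);	/* page=3 offset=100 */
	return 0;
}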
*/ -int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, +static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; @@ -315,7 +314,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -330,7 +329,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -504,8 +503,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) return ret; } -static int __copy_insn(struct address_space *mapping, struct file *filp, - void *insn, int nbytes, loff_t offset) +static int +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, + unsigned long nbytes, loff_t offset) { struct page *page; @@ -527,28 +527,28 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, static int copy_insn(struct uprobe *uprobe, struct file *filp) { - struct address_space *mapping = uprobe->inode->i_mapping; - loff_t offs = uprobe->offset; - void *insn = uprobe->arch.insn; - int size = MAX_UINSN_BYTES; - int len, err = -EIO; + struct address_space *mapping; + unsigned long nbytes; + int bytes; - /* Copy only available bytes, -EIO if nothing was read */ - do { - if (offs >= i_size_read(uprobe->inode)) - break; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); + mapping = uprobe->inode->i_mapping; - len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); - err = __copy_insn(mapping, filp, insn, len, offs); - if (err) - break; - - insn += len; - offs += len; - size -= len; - } while (size); + /* Instruction at end of binary; copy only available bytes */ + if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) + bytes = uprobe->inode->i_size - uprobe->offset; + else + bytes = MAX_UINSN_BYTES; - return err; + /* Instruction at the page-boundary; copy bytes in second page */ + if (nbytes < bytes) { + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; + bytes = nbytes; + } + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* uprobe_write_opcode() assumes we don't cross page boundary */ + /* write_opcode() assumes we don't cross page boundary */ BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); @@ -1096,22 +1096,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon } /* Slot allocation for XOL */ -static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) +static int xol_add_vma(struct xol_area *area) { + struct mm_struct *mm = current->mm; int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; - if (!area->vaddr) { - /* Try to map as high as possible, this is only a hint. 
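The copy_insn() rewrite above restores the v3.12 strategy: clamp the copy to the file size, and if the instruction straddles a page boundary, fetch the tail from the second page before copying the part in the first. A userspace model of that flow (copy_chunk() stands in for __copy_insn(), which reads from the page cache):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define MAX_UINSN_BYTES 16

/* Stand-in for __copy_insn(): copy nbytes of "file" data at offset. */
static void copy_chunk(char *dst, const char *file, size_t nbytes,
		       size_t offset)
{
	memcpy(dst, file + offset, nbytes);
}

static void copy_insn(char *insn, const char *file, size_t file_size,
		      size_t offset)
{
	size_t nbytes = PAGE_SIZE - (offset & ~PAGE_MASK);
	size_t bytes = MAX_UINSN_BYTES;

	if (offset + bytes > file_size)		/* instruction at end of file */
		bytes = file_size - offset;

	if (nbytes < bytes) {			/* crosses a page boundary */
		copy_chunk(insn + nbytes, file, bytes - nbytes,
			   offset + nbytes);
		bytes = nbytes;
	}
	copy_chunk(insn, file, bytes, offset);
}

int main(void)
{
	static char file[2 * PAGE_SIZE];
	char insn[MAX_UINSN_BYTES];

	/* plant a 16-byte "instruction" straddling the page boundary */
	memcpy(file + PAGE_SIZE - 4, "ABCDEFGHIJKLMNOP", 16);
	copy_insn(insn, file, sizeof(file), PAGE_SIZE - 4);
	printf("%.16s\n", insn);	/* ABCDEFGHIJKLMNOP */
	return 0;
}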
*/ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, - PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; - goto fail; - } + ret = -ENOMEM; + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1121,19 +1120,30 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; + ret = 0; fail: up_write(&mm->mmap_sem); return ret; } -static struct xol_area *__create_xol_area(unsigned long vaddr) +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; - uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; + + area = mm->uprobes_state.xol_area; + if (area) + goto ret; - area = kmalloc(sizeof(*area), GFP_KERNEL); + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1145,14 +1155,13 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->page) goto free_bitmap; - area->vaddr = vaddr; - init_waitqueue_head(&area->wq); - /* Reserve the 1st slot for get_trampoline_vaddr() */ + /* allocate first slot of task's xol_area for the return probes */ set_bit(0, area->bitmap); - atomic_set(&area->slot_count, 1); copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); + init_waitqueue_head(&area->wq); - if (!xol_add_vma(mm, area)) + if (!xol_add_vma(area)) return area; __free_page(area->page); @@ -1161,25 +1170,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) free_area: kfree(area); out: - return NULL; -} - -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) -{ - struct mm_struct *mm = current->mm; - struct xol_area *area; - - if (!mm->uprobes_state.xol_area) - __create_xol_area(0); - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ return area; } @@ -1263,8 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, - uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. @@ -1353,6 +1345,14 @@ void uprobe_free_utask(struct task_struct *t) } /* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; +} + +/* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. 
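get_xol_area() and xol_add_vma() above pair an smp_wmb() on the publishing side with smp_read_barrier_depends() on the reading side, so a racing thread never sees the xol_area pointer before its contents. A userspace analogue using C11 release/acquire atomics in place of the kernel barriers (the vaddr constant is a placeholder):

#include <stdatomic.h>
#include <stdlib.h>
#include <pthread.h>

struct xol_area { unsigned long vaddr; };

static _Atomic(struct xol_area *) xol_area;
static pthread_mutex_t xol_lock = PTHREAD_MUTEX_INITIALIZER;

static struct xol_area *get_xol_area(void)
{
	/* fast path: acquire pairs with the release store below */
	struct xol_area *area = atomic_load_explicit(&xol_area,
						     memory_order_acquire);
	if (area)
		return area;

	pthread_mutex_lock(&xol_lock);
	area = atomic_load_explicit(&xol_area, memory_order_relaxed);
	if (!area) {				/* we won the race: init once */
		area = calloc(1, sizeof(*area));
		if (area) {
			area->vaddr = 0xdead000;	/* fully init first */
			atomic_store_explicit(&xol_area, area,
					      memory_order_release);
		}
	}
	pthread_mutex_unlock(&xol_lock);
	return area;
}

int main(void)
{
	return get_xol_area() ? 0 : 1;		/* build with -pthread */
}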
* @@ -1367,90 +1367,6 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) -{ - struct uprobe_task *n_utask; - struct return_instance **p, *o, *n; - - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); - if (!n_utask) - return -ENOMEM; - t->utask = n_utask; - - p = &n_utask->return_instances; - for (o = o_utask->return_instances; o; o = o->next) { - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!n) - return -ENOMEM; - - *n = *o; - atomic_inc(&n->uprobe->ref); - n->next = NULL; - - *p = n; - p = &n->next; - n_utask->depth++; - } - - return 0; -} - -static void uprobe_warn(struct task_struct *t, const char *msg) -{ - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); -} - -static void dup_xol_work(struct callback_head *work) -{ - kfree(work); - - if (current->flags & PF_EXITING) - return; - - if (!__create_xol_area(current->utask->vaddr)) - uprobe_warn(current, "dup xol area"); -} - -/* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) -{ - struct uprobe_task *utask = current->utask; - struct mm_struct *mm = current->mm; - struct callback_head *work; - struct xol_area *area; - - t->utask = NULL; - - if (!utask || !utask->return_instances) - return; - - if (mm == t->mm && !(flags & CLONE_VFORK)) - return; - - if (dup_utask(t, utask)) - return uprobe_warn(t, "dup ret instances"); - - /* The task can fork() after dup_xol_work() fails */ - area = mm->uprobes_state.xol_area; - if (!area) - return uprobe_warn(t, "dup xol area"); - - if (mm == t->mm) - return; - - /* TODO: move it into the union in uprobe_task */ - work = kmalloc(sizeof(*work), GFP_KERNEL); - if (!work) - return uprobe_warn(t, "dup xol area"); - - t->utask->vaddr = area->vaddr; - init_task_work(work, dup_xol_work); - task_work_add(t, work, true); -} - /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. @@ -1941,4 +1857,9 @@ static int __init init_uprobes(void) return register_die_notifier(&uprobe_exception_nb); } -__initcall(init_uprobes); +module_init(init_uprobes); + +static void __exit exit_uprobes(void) +{ +} +module_exit(exit_uprobes); diff --git a/kernel/extable.c b/kernel/extable.c index 763faf0..832cb28 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) static inline int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && - addr < (unsigned long)_einittext) + addr <= (unsigned long)_einittext) return 1; return 0; } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && - addr < (unsigned long)_etext) + addr <= (unsigned long)_etext) return 1; if (system_state == SYSTEM_BOOTING && diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be..086fe73 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? 
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,9 +814,12 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; +#endif if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1310,7 +1313,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(clone_flags, p); + sched_fork(p); retval = perf_event_init_task(p); if (retval) @@ -1370,6 +1373,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif + uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ @@ -1486,7 +1490,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, perf_event_fork(p); trace_task_newtask(p, clone_flags); - uprobe_copy_process(p, clone_flags); return p; diff --git a/kernel/futex.c b/kernel/futex.c index 80ba086..c3a1a55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -66,7 +66,7 @@ #include <asm/futex.h> -#include "locking/rtmutex_common.h" +#include "rtmutex_common.h" int __read_mostly futex_cmpxchg_enabled; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8a..d4da55d 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -46,34 +46,4 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. -choice - prompt "Specify GCOV format" - depends on GCOV_KERNEL - default GCOV_FORMAT_AUTODETECT - ---help--- - The gcov format is usually determined by the GCC version, but there are - exceptions where format changes are integrated in lower-version GCCs. - In such a case use this option to adjust the format used in the kernel - accordingly. - - If unsure, choose "Autodetect". - -config GCOV_FORMAT_AUTODETECT - bool "Autodetect" - ---help--- - Select this option to use the format that corresponds to your GCC - version. - -config GCOV_FORMAT_3_4 - bool "GCC 3.4 format" - ---help--- - Select this option to use the format defined by GCC 3.4. - -config GCOV_FORMAT_4_7 - bool "GCC 4.7 format" - ---help--- - Select this option to use the format defined by GCC 4.7. 
- -endchoice - endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8..e97ca59 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,33 +1,3 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -# if-lt -# Usage VAR := $(call if-lt, $(a), $(b)) -# Returns 1 if (a < b) -if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) - -ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) - cc-ver := 0304 -else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) - cc-ver := 0407 -else -# Use cc-version if available, otherwise set 0 -# -# scripts/Kbuild.include, which contains cc-version function, is not included -# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" -# Meaning cc-ver is empty causing if-lt test to fail with -# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. -# This has no affect on the clean phase, but the error message could be -# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version -# is not available. We can probably move if-lt to Kbuild.include, so it's also -# not defined during clean or to include Kbuild.include in -# scripts/Makefile.clean. But the following workaround seems least invasive. - cc-ver := $(if $(call cc-version),$(call cc-version),0) -endif - -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o - -ifeq ($(call if-lt, $(cc-ver), 0407),1) - obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o -else - obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o -endif +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b..9b22d03 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -20,6 +20,7 @@ #include <linux/mutex.h> #include "gcov.h" +static struct gcov_info *gcov_info_head; static int gcov_events_enabled; static DEFINE_MUTEX(gcov_lock); @@ -33,7 +34,7 @@ void __gcov_init(struct gcov_info *info) mutex_lock(&gcov_lock); if (gcov_version == 0) { - gcov_version = gcov_info_version(info); + gcov_version = info->version; /* * Printing gcc's version magic may prove useful for debugging * incompatibility reports. @@ -44,7 +45,8 @@ void __gcov_init(struct gcov_info *info) * Add new profiling data structure to list and inform event * listener. */ - gcov_info_link(info); + info->next = gcov_info_head; + gcov_info_head = info; if (gcov_events_enabled) gcov_event(GCOV_ADD, info); mutex_unlock(&gcov_lock); @@ -79,12 +81,6 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_delta); -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - /** * gcov_enable_events - enable event reporting through gcov_event() * @@ -95,15 +91,13 @@ EXPORT_SYMBOL(__gcov_merge_ior); */ void gcov_enable_events(void) { - struct gcov_info *info = NULL; + struct gcov_info *info; mutex_lock(&gcov_lock); gcov_events_enabled = 1; - /* Perform event callback for previously registered entries. */ - while ((info = gcov_info_next(info))) + for (info = gcov_info_head; info; info = info->next) gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); } @@ -118,23 +112,25 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) { struct module *mod = data; - struct gcov_info *info = NULL; - struct gcov_info *prev = NULL; + struct gcov_info *info; + struct gcov_info *prev; if (event != MODULE_STATE_GOING) return NOTIFY_OK; mutex_lock(&gcov_lock); - + prev = NULL; /* Remove entries located in module from linked list. 
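The module-notifier loop restored above walks a singly-linked list and unlinks matching entries by carrying a prev pointer: patch prev->next when there is a predecessor, move the list head otherwise. The same shape in isolation (gcov_info reduced to a toy struct):

#include <stdio.h>

struct gcov_info_model {
	int id;
	int in_module;			/* predicate for removal */
	struct gcov_info_model *next;
};

static struct gcov_info_model *head;

/* Same shape as the notifier loop above: track prev to patch the list. */
static void remove_module_entries(void)
{
	struct gcov_info_model *info, *prev = NULL;

	for (info = head; info; info = info->next) {
		if (info->in_module) {
			if (prev)
				prev->next = info->next;
			else
				head = info->next;
			/* the kernel fires a GCOV_REMOVE event here */
		} else {
			prev = info;
		}
	}
}

int main(void)
{
	struct gcov_info_model c = { 3, 0, NULL };
	struct gcov_info_model b = { 2, 1, &c };
	struct gcov_info_model a = { 1, 0, &b };

	head = &a;
	remove_module_entries();
	for (struct gcov_info_model *i = head; i; i = i->next)
		printf("%d ", i->id);	/* prints: 1 3 */
	printf("\n");
	return 0;
}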
*/ - while ((info = gcov_info_next(info))) { + for (info = gcov_info_head; info; info = info->next) { if (within(info, mod->module_core, mod->core_size)) { - gcov_info_unlink(prev, info); + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); } else prev = info; } - mutex_unlock(&gcov_lock); return NOTIFY_OK; diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a..7a7d2ee 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str) unsigned long val; if (kstrtoul(str, 0, &val)) { - pr_warn("invalid gcov_persist parameter '%s'\n", str); + pr_warning("invalid gcov_persist parameter '%s'\n", str); return 0; } gcov_persist = val; @@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name) list_for_each_entry(node, &all_head, all) { info = get_node_info(node); - if (info && (strcmp(gcov_info_filename(info), name) == 0)) + if (info && (strcmp(info->filename, name) == 0)) return node; } @@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, seq = file->private_data; info = gcov_iter_get_info(seq->private); mutex_lock(&node_lock); - node = get_node_by_name(gcov_info_filename(info)); + node = get_node_by_name(info->filename); if (node) { /* Reset counts or remove node for unloaded modules. */ if (node->num_loaded == 0) @@ -365,7 +365,7 @@ static const char *deskew(const char *basename) */ static void add_links(struct gcov_node *node, struct dentry *parent) { - const char *basename; + char *basename; char *target; int num; int i; @@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent) if (!node->links) return; for (i = 0; i < num; i++) { - target = get_link_target( - gcov_info_filename(get_node_info(node)), - &gcov_link[i]); + target = get_link_target(get_node_info(node)->filename, + &gcov_link[i]); if (!target) goto out_err; - basename = kbasename(target); - if (basename == target) + basename = strrchr(target, '/'); + if (!basename) goto out_err; + basename++; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); if (!node->links[i]) @@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, } else node->dentry = debugfs_create_dir(node->name, parent->dentry); if (!node->dentry) { - pr_warn("could not create file\n"); + pr_warning("could not create file\n"); kfree(node); return NULL; } @@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, err_nomem: kfree(node); - pr_warn("out of memory\n"); + pr_warning("out of memory\n"); return NULL; } @@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info) struct gcov_node *parent; struct gcov_node *node; - filename = kstrdup(gcov_info_filename(info), GFP_KERNEL); + filename = kstrdup(info->filename, GFP_KERNEL); if (!filename) return; parent = &root_node; @@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) */ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); if (!loaded_info) { - pr_warn("could not add '%s' (out of memory)\n", - gcov_info_filename(info)); + pr_warning("could not add '%s' (out of memory)\n", + info->filename); return; } memcpy(loaded_info, node->loaded_info, @@ -644,9 +644,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * data set replaces the copy of the last one. 
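In add_links() the revert swaps kbasename() for an open-coded strrchr(target, '/') + 1, which treats a path without any slash as an error. A kbasename()-style helper that handles both cases, for comparison:

#include <stdio.h>
#include <string.h>

/* kbasename()-style: a path with no '/' is already a basename. */
static const char *basename_of(const char *path)
{
	const char *tail = strrchr(path, '/');
	return tail ? tail + 1 : path;
}

int main(void)
{
	printf("%s\n", basename_of("/sys/kernel/debug/gcov"));	/* gcov */
	printf("%s\n", basename_of("plain.gcda"));		/* plain.gcda */
	return 0;
}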
*/ if (!gcov_info_is_compatible(node->unloaded_info, info)) { - pr_warn("discarding saved data for %s " - "(incompatible version)\n", - gcov_info_filename(info)); + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); gcov_info_free(node->unloaded_info); node->unloaded_info = NULL; } @@ -656,8 +655,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * The initial one takes precedence. */ if (!gcov_info_is_compatible(node->loaded_info[0], info)) { - pr_warn("could not add '%s' (incompatible " - "version)\n", gcov_info_filename(info)); + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); kfree(loaded_info); return; } @@ -692,9 +691,8 @@ static void save_info(struct gcov_node *node, struct gcov_info *info) else { node->unloaded_info = gcov_info_dup(info); if (!node->unloaded_info) { - pr_warn("could not save data for '%s' " - "(out of memory)\n", - gcov_info_filename(info)); + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); } } } @@ -709,8 +707,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info) i = get_info_index(node, info); if (i < 0) { - pr_warn("could not remove '%s' (not found)\n", - gcov_info_filename(info)); + pr_warning("could not remove '%s' (not found)\n", + info->filename); return; } if (gcov_persist) @@ -737,7 +735,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) struct gcov_node *node; mutex_lock(&node_lock); - node = get_node_by_name(gcov_info_filename(info)); + node = get_node_by_name(info->filename); switch (action) { case GCOV_ADD: if (node) @@ -749,8 +747,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) if (node) remove_info(node, info); else { - pr_warn("could not remove '%s' (not found)\n", - gcov_info_filename(info)); + pr_warning("could not remove '%s' (not found)\n", + info->filename); } break; } diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 27bc88a..ae5bb42 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -21,121 +21,6 @@ #include <linux/vmalloc.h> #include "gcov.h" -#define GCOV_COUNTERS 5 - -static struct gcov_info *gcov_info_head; - -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[0]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. 
- */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; - -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - unsigned int n_functions; - const struct gcov_fn_info *functions; - unsigned int ctr_mask; - struct gcov_ctr_info counts[0]; -}; - -/** - * gcov_info_filename - return info filename - * @info: profiling data set - */ -const char *gcov_info_filename(struct gcov_info *info) -{ - return info->filename; -} - -/** - * gcov_info_version - return info version - * @info: profiling data set - */ -unsigned int gcov_info_version(struct gcov_info *info) -{ - return info->version; -} - -/** - * gcov_info_next - return next profiling data set - * @info: profiling data set - * - * Returns next gcov_info following @info or first gcov_info in the chain if - * @info is %NULL. - */ -struct gcov_info *gcov_info_next(struct gcov_info *info) -{ - if (!info) - return gcov_info_head; - - return info->next; -} - -/** - * gcov_info_link - link/add profiling data set to the list - * @info: profiling data set - */ -void gcov_info_link(struct gcov_info *info) -{ - info->next = gcov_info_head; - gcov_info_head = info; -} - -/** - * gcov_info_unlink - unlink/remove profiling data set from the list - * @prev: previous profiling data set - * @info: profiling data set - */ -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) -{ - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; -} - /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c deleted file mode 100644 index 2c6e463..0000000 --- a/kernel/gcov/gcc_4_7.c +++ /dev/null @@ -1,560 +0,0 @@ -/* - * This code provides functions to handle gcc's profiling data format - * introduced with gcc 4.7. - * - * This file is based heavily on gcc_3_4.c file. - * - * For a better understanding, refer to gcc source: - * gcc/gcov-io.h - * libgcc/libgcov.c - * - * Uses gcc-internal data definitions. - */ - -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/seq_file.h> -#include <linux/vmalloc.h> -#include "gcov.h" - -#define GCOV_COUNTERS 8 -#define GCOV_TAG_FUNCTION_LENGTH 3 - -static struct gcov_info *gcov_info_head; - -/** - * struct gcov_ctr_info - information about counters for a single function - * @num: number of counter values for this type - * @values: array of counter values for this type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. 
- */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; -}; - -/** - * struct gcov_fn_info - profiling meta data per function - * @key: comdat key - * @ident: unique ident of function - * @lineno_checksum: function lineo_checksum - * @cfg_checksum: function cfg checksum - * @ctrs: instrumented counters - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - * - * Information about a single function. This uses the trailing array - * idiom. The number of counters is determined from the merge pointer - * array in gcov_info. The key is used to detect which of a set of - * comdat functions was selected -- it points to the gcov_info object - * of the object file containing the selected comdat function. - */ -struct gcov_fn_info { - const struct gcov_info *key; - unsigned int ident; - unsigned int lineno_checksum; - unsigned int cfg_checksum; - struct gcov_ctr_info ctrs[0]; -}; - -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: uniquifying time stamp - * @filename: name of the associated gcov data file - * @merge: merge functions (null for unused counter type) - * @n_functions: number of instrumented functions - * @functions: pointer to pointers to function information - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); - unsigned int n_functions; - struct gcov_fn_info **functions; -}; - -/** - * gcov_info_filename - return info filename - * @info: profiling data set - */ -const char *gcov_info_filename(struct gcov_info *info) -{ - return info->filename; -} - -/** - * gcov_info_version - return info version - * @info: profiling data set - */ -unsigned int gcov_info_version(struct gcov_info *info) -{ - return info->version; -} - -/** - * gcov_info_next - return next profiling data set - * @info: profiling data set - * - * Returns next gcov_info following @info or first gcov_info in the chain if - * @info is %NULL. - */ -struct gcov_info *gcov_info_next(struct gcov_info *info) -{ - if (!info) - return gcov_info_head; - - return info->next; -} - -/** - * gcov_info_link - link/add profiling data set to the list - * @info: profiling data set - */ -void gcov_info_link(struct gcov_info *info) -{ - info->next = gcov_info_head; - gcov_info_head = info; -} - -/** - * gcov_info_unlink - unlink/remove profiling data set from the list - * @prev: previous profiling data set - * @info: profiling data set - */ -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) -{ - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; -} - -/* Symbolic links to be created for each profiling data file. */ -const struct gcov_link gcov_link[] = { - { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ - { 0, NULL}, -}; - -/* - * Determine whether a counter is active. Doesn't change at run-time. - */ -static int counter_active(struct gcov_info *info, unsigned int type) -{ - return info->merge[type] ? 1 : 0; -} - -/* Determine number of active counters. Based on gcc magic. 
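counter_active() and num_counter_active() in the gcc_4_7.c code being deleted above rely on a gcc convention: an unused counter type has a NULL merge hook in gcov_info's merge[] array, so counting active counters is just counting non-NULL function pointers. A compilable sketch of that check:

#include <stdio.h>

#define GCOV_COUNTERS 8

typedef long long gcov_type;

/* gcc marks a counter type unused by leaving its merge hook NULL. */
struct info_model {
	void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
};

static unsigned int num_counter_active(const struct info_model *info)
{
	unsigned int i, result = 0;

	for (i = 0; i < GCOV_COUNTERS; i++)
		if (info->merge[i])
			result++;
	return result;
}

static void merge_add(gcov_type *c, unsigned int n) { (void)c; (void)n; }

int main(void)
{
	struct info_model info = { .merge = { [0] = merge_add } };

	printf("%u\n", num_counter_active(&info));	/* 1 */
	return 0;
}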
*/ -static unsigned int num_counter_active(struct gcov_info *info) -{ - unsigned int i; - unsigned int result = 0; - - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(info, i)) - result++; - } - return result; -} - -/** - * gcov_info_reset - reset profiling data to zero - * @info: profiling data set - */ -void gcov_info_reset(struct gcov_info *info) -{ - struct gcov_ctr_info *ci_ptr; - unsigned int fi_idx; - unsigned int ct_idx; - - for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { - ci_ptr = info->functions[fi_idx]->ctrs; - - for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { - if (!counter_active(info, ct_idx)) - continue; - - memset(ci_ptr->values, 0, - sizeof(gcov_type) * ci_ptr->num); - ci_ptr++; - } - } -} - -/** - * gcov_info_is_compatible - check if profiling data can be added - * @info1: first profiling data set - * @info2: second profiling data set - * - * Returns non-zero if profiling data can be added, zero otherwise. - */ -int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) -{ - return (info1->stamp == info2->stamp); -} - -/** - * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added - * - * Adds profiling counts of @source to @dest. - */ -void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) -{ - struct gcov_ctr_info *dci_ptr; - struct gcov_ctr_info *sci_ptr; - unsigned int fi_idx; - unsigned int ct_idx; - unsigned int val_idx; - - for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) { - dci_ptr = dst->functions[fi_idx]->ctrs; - sci_ptr = src->functions[fi_idx]->ctrs; - - for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { - if (!counter_active(src, ct_idx)) - continue; - - for (val_idx = 0; val_idx < sci_ptr->num; val_idx++) - dci_ptr->values[val_idx] += - sci_ptr->values[val_idx]; - - dci_ptr++; - sci_ptr++; - } - } -} - -/** - * gcov_info_dup - duplicate profiling data set - * @info: profiling data set to duplicate - * - * Return newly allocated duplicate on success, %NULL on error. 
- */ -struct gcov_info *gcov_info_dup(struct gcov_info *info) -{ - struct gcov_info *dup; - struct gcov_ctr_info *dci_ptr; /* dst counter info */ - struct gcov_ctr_info *sci_ptr; /* src counter info */ - unsigned int active; - unsigned int fi_idx; /* function info idx */ - unsigned int ct_idx; /* counter type idx */ - size_t fi_size; /* function info size */ - size_t cv_size; /* counter values size */ - - dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); - if (!dup) - return NULL; - - dup->next = NULL; - dup->filename = NULL; - dup->functions = NULL; - - dup->filename = kstrdup(info->filename, GFP_KERNEL); - if (!dup->filename) - goto err_free; - - dup->functions = kcalloc(info->n_functions, - sizeof(struct gcov_fn_info *), GFP_KERNEL); - if (!dup->functions) - goto err_free; - - active = num_counter_active(info); - fi_size = sizeof(struct gcov_fn_info); - fi_size += sizeof(struct gcov_ctr_info) * active; - - for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { - dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL); - if (!dup->functions[fi_idx]) - goto err_free; - - *(dup->functions[fi_idx]) = *(info->functions[fi_idx]); - - sci_ptr = info->functions[fi_idx]->ctrs; - dci_ptr = dup->functions[fi_idx]->ctrs; - - for (ct_idx = 0; ct_idx < active; ct_idx++) { - - cv_size = sizeof(gcov_type) * sci_ptr->num; - - dci_ptr->values = vmalloc(cv_size); - - if (!dci_ptr->values) - goto err_free; - - dci_ptr->num = sci_ptr->num; - memcpy(dci_ptr->values, sci_ptr->values, cv_size); - - sci_ptr++; - dci_ptr++; - } - } - - return dup; -err_free: - gcov_info_free(dup); - return NULL; -} - -/** - * gcov_info_free - release memory for profiling data set duplicate - * @info: profiling data set duplicate to free - */ -void gcov_info_free(struct gcov_info *info) -{ - unsigned int active; - unsigned int fi_idx; - unsigned int ct_idx; - struct gcov_ctr_info *ci_ptr; - - if (!info->functions) - goto free_info; - - active = num_counter_active(info); - - for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { - if (!info->functions[fi_idx]) - continue; - - ci_ptr = info->functions[fi_idx]->ctrs; - - for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) - vfree(ci_ptr->values); - - kfree(info->functions[fi_idx]); - } - -free_info: - kfree(info->functions); - kfree(info->filename); - kfree(info); -} - -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. 
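store_gcov_u32()/store_gcov_u64() and gcov_iter_new() in the deleted gcc_4_7.c use a size-then-fill pattern worth noting: the serializer accepts a NULL buffer and only returns byte counts, so one dry run sizes the allocation and a second pass fills it. A reduced version of that pattern (the magic constant is GCOV_DATA_MAGIC from this file):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Store v at off if buffer is non-NULL; always report bytes "written". */
static size_t store_u32(void *buffer, size_t off, uint32_t v)
{
	if (buffer)
		memcpy((char *)buffer + off, &v, sizeof(v));
	return sizeof(v);
}

/* Serializer usable both for sizing (buffer == NULL) and for filling. */
static size_t convert(void *buffer, const uint32_t *vals, size_t n)
{
	size_t pos = 0, i;

	pos += store_u32(buffer, pos, 0x67636461);	/* GCOV_DATA_MAGIC */
	for (i = 0; i < n; i++)
		pos += store_u32(buffer, pos, vals[i]);
	return pos;
}

int main(void)
{
	uint32_t vals[] = { 1, 2, 3 };
	size_t size = convert(NULL, vals, 3);	/* dry run: just count */
	void *buf = malloc(size);

	if (!buf)
		return 1;
	convert(buf, vals, 3);			/* second pass: fill */
	printf("%zu bytes\n", size);		/* 16 bytes */
	free(buf);
	return 0;
}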
- */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} - -/** - * convert_to_gcda - convert profiling data set to gcda file format - * @buffer: the buffer to store file data or %NULL if no data should be stored - * @info: profiling data set to be converted - * - * Returns the number of bytes that were/would have been stored into the buffer. - */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) -{ - struct gcov_fn_info *fi_ptr; - struct gcov_ctr_info *ci_ptr; - unsigned int fi_idx; - unsigned int ct_idx; - unsigned int cv_idx; - size_t pos = 0; - - /* File header. */ - pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); - pos += store_gcov_u32(buffer, pos, info->version); - pos += store_gcov_u32(buffer, pos, info->stamp); - - for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { - fi_ptr = info->functions[fi_idx]; - - /* Function record. */ - pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); - pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); - pos += store_gcov_u32(buffer, pos, fi_ptr->ident); - pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); - pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); - - ci_ptr = fi_ptr->ctrs; - - for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { - if (!counter_active(info, ct_idx)) - continue; - - /* Counter record. */ - pos += store_gcov_u32(buffer, pos, - GCOV_TAG_FOR_COUNTER(ct_idx)); - pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); - - for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { - pos += store_gcov_u64(buffer, pos, - ci_ptr->values[cv_idx]); - } - - ci_ptr++; - } - } - - return pos; -} - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. 
*/ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 92c8e22..060073e 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -21,6 +21,7 @@ * gcc and need to be kept as close to the original definition as possible to * remain compatible. */ +#define GCOV_COUNTERS 5 #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) @@ -33,18 +34,60 @@ typedef long gcov_type; typedef long long gcov_type; #endif -/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so - * we cannot use full definition here and they need to be placed in gcc specific - * implementation of gcov. This also means no direct access to the members in - * generic code and usage of the interface below.*/ -struct gcov_info; +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { + unsigned int ident; + unsigned int checksum; + unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. 
+ */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; + void (*merge)(gcov_type *, unsigned int); +}; -/* Interface to access gcov_info data */ -const char *gcov_info_filename(struct gcov_info *info); -unsigned int gcov_info_version(struct gcov_info *info); -struct gcov_info *gcov_info_next(struct gcov_info *info); -void gcov_info_link(struct gcov_info *info); -void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + unsigned int n_functions; + const struct gcov_fn_info *functions; + unsigned int ctr_mask; + struct gcov_ctr_info counts[0]; +}; /* Base interface. */ enum gcov_action { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9328b80..3e97fb1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,12 +16,11 @@ #include <linux/export.h> #include <linux/sysctl.h> #include <linux/utsname.h> -#include <trace/events/sched.h> /* * The number of tasks checked: */ -int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Limit number of tasks checked in a batch. 
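The gcov structs restored above end in zero-length arrays (n_ctrs[0], counts[0]), the old GNU spelling of C99's flexible array member: the caller over-allocates past the struct and the trailing array indexes into that tail. A minimal example using the standard [] form:

#include <stdio.h>
#include <stdlib.h>

/* Trailing-array idiom, as in gcov_fn_info's n_ctrs[0]. */
struct fn_info {
	unsigned int ident;
	unsigned int n_ctrs[];	/* storage allocated past the struct */
};

int main(void)
{
	unsigned int counters = 4;
	struct fn_info *fi = calloc(1, sizeof(*fi) +
				    counters * sizeof(fi->n_ctrs[0]));

	if (!fi)
		return 1;
	fi->n_ctrs[3] = 7;	/* lives in the over-allocated tail */
	printf("%u\n", fi->n_ctrs[3]);
	free(fi);
	return 0;
}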
@@ -93,9 +92,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) t->last_switch_count = switch_count; return; } - - trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -207,14 +203,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } -static atomic_t reset_hung_task = ATOMIC_INIT(0); - -void reset_hung_task_detector(void) -{ - atomic_set(&reset_hung_task, 1); -} -EXPORT_SYMBOL_GPL(reset_hung_task_detector); - /* * kthread which checks for tasks stuck in D state */ @@ -228,9 +216,6 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; - if (atomic_xchg(&reset_hung_task, 0)) - continue; - check_hung_uninterruptible_tasks(timeout); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c16..a3bb14f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc) } /** - * irq_disable - Mark interrupt disabled + * irq_disable - Mark interupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb3..706724e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -465,26 +465,27 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) { struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; + domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(irq_data->np)); + of_node_full_name(controller)); return 0; } /* If domain has no translation, then we assume interrupt line */ if (domain->ops->xlate == NULL) - hwirq = irq_data->args[0]; + hwirq = intspec[0]; else { - if (domain->ops->xlate(domain, irq_data->np, irq_data->args, - irq_data->args_count, &hwirq, &type)) + if (domain->ops->xlate(domain, controller, intspec, intsize, + &hwirq, &type)) return 0; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c..514bcfd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) } /* - * Interrupts explicitly requested as threaded interrupts want to be + * Interrupts explicitely requested as threaded interupts want to be * preemtible - many of them need to sleep and wait for slow busses to * complete. 
*/ @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mput; } - sched_setscheduler_nocheck(t, SCHED_FIFO, &param); + sched_setscheduler(t, SCHED_FIFO, &param); /* * We keep the reference to the task struct even if diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index abcd6ca..cb228bf 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (!is_early && want_early) + if (is_early != want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 3320b84..1162f10 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,7 +14,6 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, - _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -27,7 +26,6 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON -#define IRQ_IS_POLLED GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -149,8 +147,3 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -static inline bool irq_settings_is_polled(struct irq_desc *desc) -{ - return desc->status_use_accessors & _IRQ_IS_POLLED; -} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc6..7b5f012 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -67,13 +67,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); - /* - * PER_CPU, nested thread interrupts and interrupts explicitely - * marked polled are excluded from polling.
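The resume_irqs() change above is subtle: the check being reverted away, is_early != want_early, resumes each interrupt in exactly one of the two resume passes, while the restored !is_early && want_early skips non-early interrupts in the early pass but still revisits IRQF_EARLY_RESUME interrupts in the late pass. A standalone demo of the two skip predicates, assuming nothing beyond the booleans themselves:

#include <stdio.h>

int main(void)
{
	int is_early, want_early;

	puts("is_early want_early  restored-skip reverted-skip");
	for (want_early = 1; want_early >= 0; want_early--)
		for (is_early = 0; is_early <= 1; is_early++)
			printf("%8d %10d %14d %13d\n", is_early, want_early,
			       !is_early && want_early,	/* restored check */
			       is_early != want_early);	/* check being reverted away */
	/* With the restored check an IRQF_EARLY_RESUME irq is resumed in
	 * both passes; with the newer check, in its own pass only. */
	return 0;
}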
- */ - if (irq_settings_is_per_cpu(desc) || - irq_settings_is_nested_thread(desc) || - irq_settings_is_polled(desc)) + /* PER_CPU and nested thread interrupts are never polled */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) goto out; /* @@ -273,8 +268,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->istate & IRQS_POLL_INPROGRESS || - irq_settings_is_polled(desc)) + if (desc->istate & IRQS_POLL_INPROGRESS) return; /* we get here again via the threaded handler */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 9019f15..297a924 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,7 +58,6 @@ static void jump_label_update(struct static_key *key, int enable); void static_key_slow_inc(struct static_key *key) { - STATIC_KEY_CHECK_USE(); if (atomic_inc_not_zero(&key->enabled)) return; @@ -104,14 +103,12 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { - STATIC_KEY_CHECK_USE(); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); @@ -119,7 +116,6 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { - STATIC_KEY_CHECK_USE(); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } @@ -216,7 +212,6 @@ void __init jump_label_init(void) key->next = NULL; #endif } - static_key_initialized = true; jump_label_unlock(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 490afc0..2a74f30 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image, * reinitialize them. * * - A machine specific part that includes the syscall number - * and then copies the image to it's final destination. And + * and the copies the image to it's final destination. And * jumps into the image at entry. * * kexec does not sync, or unmount filesystems so if you need diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfc..a0d367a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2066,7 +2066,7 @@ static int __init init_kprobes(void) { int i, err = 0; unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; + char *modname, namebuf[128]; const char *symbol_name; void *addr; struct kprobe_blackpoint *kb; @@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) const char *sym = NULL; unsigned int i = *(loff_t *) v; unsigned long offset = 0; - char *modname, namebuf[KSYM_NAME_LEN]; + char *modname, namebuf[128]; head = &kprobe_table[i]; preempt_disable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee..760e86d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -33,7 +33,7 @@ struct kthread_create_info /* Result passed back to kthread_create() from kthreadd. 
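The jump_label hunks above strip the STATIC_KEY_CHECK_USE() guards that warned when a static key was toggled before jump_label_init(). For orientation, a minimal sketch of the consumer-side static-key API of this era, assuming the standard <linux/jump_label.h> interface; the tracing_enabled key and both helpers are hypothetical examples.

#include <linux/jump_label.h>
#include <linux/printk.h>
#include <linux/types.h>

static struct static_key tracing_enabled = STATIC_KEY_INIT_FALSE;

static void use_tracing(void)
{
	/* Compiles down to a patchable no-op/jump, not a load and test. */
	if (static_key_false(&tracing_enabled))
		printk("tracing path\n");
}

static void set_tracing(bool on)
{
	if (on)
		static_key_slow_inc(&tracing_enabled);	/* patches call sites */
	else
		static_key_slow_dec(&tracing_enabled);
}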
*/ struct task_struct *result; - struct completion *done; + struct completion done; struct list_head list; }; @@ -178,7 +178,6 @@ static int kthread(void *_create) struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; - struct completion *done; struct kthread self; int ret; @@ -188,16 +187,10 @@ static int kthread(void *_create) init_completion(&self.parked); current->vfork_done = &self.exited; - /* If user was SIGKILLed, I release the structure. */ - done = xchg(&create->done, NULL); - if (!done) { - kfree(create); - do_exit(-EINTR); - } /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; - complete(done); + complete(&create->done); schedule(); ret = -EINTR; @@ -230,15 +223,8 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { - /* If user was SIGKILLed, I release the structure. */ - struct completion *done = xchg(&create->done, NULL); - - if (!done) { - kfree(create); - return; - } create->result = ERR_PTR(pid); - complete(done); + complete(&create->done); } } @@ -269,59 +255,36 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), const char namefmt[], ...) { - DECLARE_COMPLETION_ONSTACK(done); - struct task_struct *task; - struct kthread_create_info *create = kmalloc(sizeof(*create), - GFP_KERNEL); - - if (!create) - return ERR_PTR(-ENOMEM); - create->threadfn = threadfn; - create->data = data; - create->node = node; - create->done = &done; + struct kthread_create_info create; + + create.threadfn = threadfn; + create.data = data; + create.node = node; + init_completion(&create.done); spin_lock(&kthread_create_lock); - list_add_tail(&create->list, &kthread_create_list); + list_add_tail(&create.list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); - /* - * Wait for completion in killable state, for I might be chosen by - * the OOM killer while kthreadd is trying to allocate memory for - * new kernel thread. - */ - if (unlikely(wait_for_completion_killable(&done))) { - /* - * If I was SIGKILLed before kthreadd (or new kernel thread) - * calls complete(), leave the cleanup of this structure to - * that thread. - */ - if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); - /* - * kthreadd (or new kernel thread) will call complete() - * shortly. - */ - wait_for_completion(&done); - } - task = create->result; - if (!IS_ERR(task)) { + wait_for_completion(&create.done); + + if (!IS_ERR(create.result)) { static const struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + vsnprintf(create.result->comm, sizeof(create.result->comm), + namefmt, args); va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. 
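The kthread_create_on_node() rewrite above moves struct kthread_create_info back onto the creator's stack with an embedded completion, which is safe only because the creator now waits unconditionally before returning. A generic sketch of that on-stack completion handoff; struct create_req and submit_to_worker() are hypothetical stand-ins, not the kthread code itself.

#include <linux/completion.h>
#include <linux/sched.h>

/* Hypothetical request type; the completion is embedded, not allocated. */
struct create_req {
	int result;
	struct completion done;
};

/* Worker side: fill in the result, then signal the sleeping creator. */
static void finish_req(struct create_req *req, int result)
{
	req->result = result;
	complete(&req->done);	/* creator may unwind its stack after this */
}

/* Creator side: the request lives on this stack frame, so we must not
 * return before the worker has called complete(). */
static int submit_and_wait(void (*submit_to_worker)(struct create_req *))
{
	struct create_req req;

	init_completion(&req.done);
	submit_to_worker(&req);
	wait_for_completion(&req.done);
	return req.result;
}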
*/ - sched_setscheduler_nocheck(task, SCHED_NORMAL, &param); - set_cpus_allowed_ptr(task, cpu_all_mask); + sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); + set_cpus_allowed_ptr(create.result, cpu_all_mask); } - kfree(create); - return task; + return create.result; } EXPORT_SYMBOL(kthread_create_on_node); diff --git a/kernel/locking/lglock.c b/kernel/lglock.c index 86ae2ae..86ae2ae 100644 --- a/kernel/locking/lglock.c +++ b/kernel/lglock.c diff --git a/kernel/locking/lockdep.c b/kernel/lockdep.c index 576ba75..e16c45b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/lockdep.c @@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data) return 0; } -static unsigned long __lockdep_count_forward_deps(struct lock_list *this) +unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); @@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) return ret; } -static unsigned long __lockdep_count_backward_deps(struct lock_list *this) +unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); @@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() + : rcu_is_cpu_idle() ? "RCU used illegally from idle CPU!\n" : "", rcu_scheduler_active, debug_locks); @@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * So complain bitterly if someone does call rcu_read_lock(), * rcu_read_lock_bh() and so on from extended quiescent states. */ - if (!rcu_is_watching()) + if (rcu_is_cpu_idle()) printk("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); diff --git a/kernel/locking/lockdep_internals.h b/kernel/lockdep_internals.h index 4f560cf..4f560cf 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/lockdep_internals.h diff --git a/kernel/locking/lockdep_proc.c b/kernel/lockdep_proc.c index ef43ac4..b2c71c5 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -421,7 +421,6 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) @@ -519,20 +518,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_puts(m, "lock_stat version 0.4\n"); + seq_printf(m, "lock_stat version 0.3\n"); if (unlikely(!debug_locks)) seq_printf(m, "*WARNING* lock debugging disabled!!
- possibly due to a lockdep warning\n"); - seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", "class name", "con-bounces", @@ -540,14 +539,12 @@ static void seq_header(struct seq_file *m) "waittime-min", "waittime-max", "waittime-total", - "waittime-avg", "acq-bounces", "acquisitions", "holdtime-min", "holdtime-max", - "holdtime-total", - "holdtime-avg"); - seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); + "holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "\n"); } diff --git a/kernel/locking/lockdep_states.h b/kernel/lockdep_states.h index 995b0cc..995b0cc 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/lockdep_states.h diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile deleted file mode 100644 index baab8e5..0000000 --- a/kernel/locking/Makefile +++ /dev/null @@ -1,25 +0,0 @@ - -obj-y += mutex.o semaphore.o rwsem.o lglock.o - -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg -endif - -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o -obj-$(CONFIG_LOCKDEP) += lockdep.o -ifeq ($(CONFIG_PROC_FS),y) -obj-$(CONFIG_LOCKDEP) += lockdep_proc.o -endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c deleted file mode 100644 index 652a8ee..0000000 --- a/kernel/locking/percpu-rwsem.c +++ /dev/null @@ -1,165 +0,0 @@ -#include <linux/atomic.h> -#include <linux/rwsem.h> -#include <linux/percpu.h> -#include <linux/wait.h> -#include <linux/lockdep.h> -#include <linux/percpu-rwsem.h> -#include <linux/rcupdate.h> -#include <linux/sched.h> -#include <linux/errno.h> - -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, - const char *name, struct lock_class_key *rwsem_key) -{ - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) - return -ENOMEM; - - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); - return 0; -} - -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) -{ - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ -} - -/* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. 
- * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success = false; - - preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { - __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - return; - } - - down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ - __up_read(&brw->rw_sem); -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ -void percpu_down_write(struct percpu_rw_semaphore *brw) -{ - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); - /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). - * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). 
- */ - synchronize_sched_expedited(); - - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); - - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); - - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); -} - -void percpu_up_write(struct percpu_rw_semaphore *brw) -{ - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); - /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). - */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); -} diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index 9be8a91..0000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,296 +0,0 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->activity = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. 
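The percpu-rwsem implementation deleted above optimizes for read-mostly data: percpu_down_read() is normally just a preempt-disabled per-cpu increment, while percpu_down_write() pays for that with synchronize_sched_expedited() on both lock and unlock. A usage sketch under the <linux/percpu-rwsem.h> API shown; cfg_rwsem, cfg_value and the helper functions are hypothetical.

#include <linux/init.h>
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore cfg_rwsem;
static int cfg_value;

static int __init cfg_setup(void)
{
	return percpu_init_rwsem(&cfg_rwsem);	/* allocates fast_read_ctr */
}

static int cfg_read(void)
{
	int val;

	percpu_down_read(&cfg_rwsem);	/* usually just a per-cpu increment */
	val = cfg_value;
	percpu_up_read(&cfg_rwsem);
	return val;
}

static void cfg_write(int val)
{
	percpu_down_write(&cfg_rwsem);	/* forces readers onto the slow path */
	cfg_value = val;
	percpu_up_write(&cfg_rwsem);
}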
*/ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->activity += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -void __sched __down_read(struct rw_semaphore *sem) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - out: - ; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
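In the spinlock-based rwsem above, sem->activity encodes the entire lock state: 0 means unlocked, a positive value counts active readers, and -1 marks a single active writer; the stealing loop just quoted sleeps until activity returns to 0 regardless of queue order. An illustrative walk through those transitions in plain C (no locking, values only):

#include <stdio.h>

int main(void)
{
	int activity = 0;	/* 0: unlocked */

	activity++;		/* __down_read: one reader */
	activity++;		/* second reader: two */
	activity -= 2;		/* both __up_read: unlocked again */
	activity = -1;		/* __down_write: writer owns the lock */
	activity = 0;		/* __up_write */
	activity = 1;		/* __downgrade_write: writer becomes a reader */
	printf("final activity = %d (one reader)\n", activity);
	return 0;
}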
- */ - if (sem->activity == 0) - break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->activity = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0) { - /* got the lock */ - sem->activity = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 19c5fa9..0000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,293 +0,0 @@ -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). 
- * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi <alex.shi@intel.com> - * and Michel Lespinasse <walken@google.com> - */ -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/export.h> - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = RWSEM_UNLOCKED_VALUE; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false - */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. - */ - wake_up_process(waiter->task); - goto out; - } - - /* Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - adjustment = 0; - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) - goto out; - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - } - - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. 
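The xadd rwsem above folds readers, writers and waiters into one atomic count via bias constants. A worked example using the common 32-bit values (RWSEM_ACTIVE_BIAS 0x1, RWSEM_ACTIVE_MASK 0xffff, RWSEM_WAITING_BIAS -0x10000); these constants are per-architecture assumptions, not part of this hunk.

#include <stdio.h>

#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0x0000ffffL
#define RWSEM_WAITING_BIAS	(-0x00010000L)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

static void show(const char *what, long count)
{
	/* active lockers live in the low bits; waiters push count
	 * below -RWSEM_ACTIVE_MASK */
	printf("%-24s count=%#lx active=%ld waiters=%s\n", what,
	       (unsigned long)count, count & RWSEM_ACTIVE_MASK,
	       count < -RWSEM_ACTIVE_MASK ? "yes" : "no");
}

int main(void)
{
	show("unlocked", 0);
	show("two readers", 2 * RWSEM_ACTIVE_READ_BIAS);
	show("one writer", RWSEM_ACTIVE_WRITE_BIAS);
	show("writer + queued waiter",
	     RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
	return 0;
}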
- */ - woken = 0; - do { - woken++; - - if (waiter->list.next == &sem->wait_list) - break; - - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - rwsem_atomic_add(adjustment, sem); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - } while (--loop); - - sem->wait_list.next = next; - next->prev = &sem->wait_list; - - out: - return sem; -} - -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) - break; - schedule(); - } - - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * wait until we successfully acquire the write lock - */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - - /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. 
*/ - do { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c deleted file mode 100644 index 0374a59..0000000 --- a/kernel/locking/spinlock_debug.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2005, Red Hat, Inc., Ingo Molnar - * Released under the General Public License (GPL). - * - * This file contains the spinlock/rwlock implementations for - * DEBUG_SPINLOCK. - */ - -#include <linux/spinlock.h> -#include <linux/nmi.h> -#include <linux/interrupt.h> -#include <linux/debug_locks.h> -#include <linux/delay.h> -#include <linux/export.h> - -void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - lock->magic = SPINLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__raw_spin_lock_init); - -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; - lock->magic = RWLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__rwlock_init); - -static void spin_dump(raw_spinlock_t *lock, const char *msg) -{ - struct task_struct *owner = NULL; - - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; - printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", - msg, raw_smp_processor_id(), - current->comm, task_pid_nr(current)); - printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " - ".owner_cpu: %d\n", - lock, lock->magic, - owner ? owner->comm : "<none>", - owner ? 
task_pid_nr(owner) : -1, - lock->owner_cpu); - dump_stack(); -} - -static void spin_bug(raw_spinlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - spin_dump(lock, msg); -} - -#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) - -static inline void -debug_spin_lock_before(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_spin_lock_after(raw_spinlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_spin_unlock(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); - SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); - SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - -void do_raw_spin_lock(raw_spinlock_t *lock) -{ - debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); - debug_spin_lock_after(lock); -} - -int do_raw_spin_trylock(raw_spinlock_t *lock) -{ - int ret = arch_spin_trylock(&lock->raw_lock); - - if (ret) - debug_spin_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_spin_unlock(raw_spinlock_t *lock) -{ - debug_spin_unlock(lock); - arch_spin_unlock(&lock->raw_lock); -} - -static void rwlock_bug(rwlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", - msg, raw_smp_processor_id(), current->comm, - task_pid_nr(current), lock); - dump_stack(); -} - -#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) - -#if 0 /* __write_lock_debug() can lock up - maybe this can too? 
*/ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_read_lock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_lock(&lock->raw_lock); -} - -int do_raw_read_trylock(rwlock_t *lock) -{ - int ret = arch_read_trylock(&lock->raw_lock); - -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_read_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_unlock(&lock->raw_lock); -} - -static inline void debug_write_lock_before(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_write_lock_after(rwlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_write_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); - RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_write_lock(rwlock_t *lock) -{ - debug_write_lock_before(lock); - arch_write_lock(&lock->raw_lock); - debug_write_lock_after(lock); -} - -int do_raw_write_trylock(rwlock_t *lock) -{ - int ret = arch_write_trylock(&lock->raw_lock); - - if (ret) - debug_write_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_write_unlock(rwlock_t *lock) -{ - debug_write_unlock(lock); - arch_write_unlock(&lock->raw_lock); -} diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 0000000..4a9a86d --- /dev/null +++ b/kernel/modsign_certificate.S @@ -0,0 +1,12 @@ +#include <linux/export.h> + +#define GLOBAL(name) \ + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 0000000..7cbd450 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,104 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initconst const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ + pr_notice("Initialise module verification\n"); + + modsign_keyring = keyring_alloc(".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(modsign_keyring)) + panic("Can't allocate module signing keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading module verification certificates\n"); + + end = modsign_certificate_list_end; + p = modsign_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(modsign_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) + pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + else + pr_notice("MODSIGN: Loaded cert '%s'\n", + key_ref_to_ptr(key)->description); + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 915e123..24f9247 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,4 +9,6 @@ * 2 of the Licence, or (at your option) any later version. 
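load_module_signing_keys() above walks the concatenated certificate blob by decoding each certificate's outer ASN.1 header: a DER SEQUENCE with a two-byte length starts 0x30 0x82, and the certificate's total size is that 16-bit length plus the four header bytes. A standalone sketch of the same decode; it uses the strict || form of the tag check (the code above tests with &&), and the sample blob is a made-up header, since real certificates are far larger than this.

#include <stddef.h>
#include <stdio.h>

static size_t cert_len(const unsigned char *p, const unsigned char *end)
{
	size_t plen;

	if (end - p < 4)
		return 0;			/* truncated header */
	if (p[0] != 0x30 || p[1] != 0x82)
		return 0;			/* not SEQUENCE, 2 length bytes */
	plen = ((size_t)p[2] << 8) | p[3];	/* payload length */
	plen += 4;				/* plus the header itself */
	return plen <= (size_t)(end - p) ? plen : 0;
}

int main(void)
{
	/* Hypothetical 3-byte payload "certificate" for demonstration. */
	const unsigned char blob[] = { 0x30, 0x82, 0x00, 0x03, 1, 2, 3 };

	printf("cert size: %zu bytes\n", cert_len(blob, blob + sizeof(blob)));
	return 0;
}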
*/ +extern struct key *modsign_keyring; + extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index f5a3b1e..dc58274 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -378,21 +378,23 @@ static bool check_symbol(const struct symsearch *syms, if (syms->licence == GPL_ONLY) return false; if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { - pr_warn("Symbol %s is being used by a non-GPL module, " - "which will not be allowed in the future\n", - fsa->name); + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); } } #ifdef CONFIG_UNUSED_SYMBOLS if (syms->unused && fsa->warn) { - pr_warn("Symbol %s is marked as UNUSED, however this module is " - "using it.\n", fsa->name); - pr_warn("This symbol will go away in the future.\n"); - pr_warn("Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); } #endif @@ -490,15 +492,16 @@ static int percpu_modalloc(struct module *mod, struct load_info *info) return 0; if (align > PAGE_SIZE) { - pr_warn("%s: per-cpu alignment %li > %li\n", - mod->name, align, PAGE_SIZE); + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { - pr_warn("%s: Could not allocate %lu bytes percpu data\n", - mod->name, (unsigned long)pcpusec->sh_size); + printk(KERN_WARNING + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } mod->percpu_size = pcpusec->sh_size; @@ -641,6 +644,8 @@ static int module_unload_init(struct module *mod) /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); + /* Backwards compatibility macros put refcount during init. */ + mod->waiter = current; return 0; } @@ -674,7 +679,7 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { - pr_warn("%s: out of memory loading\n", a->name); + printk(KERN_WARNING "%s: out of memory loading\n", a->name); return -ENOMEM; } @@ -766,9 +771,16 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - struct stopref sref = { mod, flags, forced }; + if (flags & O_NONBLOCK) { + struct stopref sref = { mod, flags, forced }; - return stop_machine(__try_stop_module, &sref, NULL); + return stop_machine(__try_stop_module, &sref, NULL); + } else { + /* We don't need to stop the machine for this. 
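The restored unload path above splits into two cases: with O_NONBLOCK the refcount is frozen via stop_machine(), otherwise the module is marked GOING and delete_module() sleeps in wait_for_zero_refcount() until module_put() wakes mod->waiter. A generic sketch of that sleep/wake pairing; struct obj and its helpers are hypothetical, not struct module.

#include <linux/atomic.h>
#include <linux/sched.h>

struct obj {
	atomic_t refs;
	struct task_struct *waiter;
};

static void obj_wait_for_zero(struct obj *o)
{
	o->waiter = current;		/* publish before testing refs */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&o->refs) == 0)
			break;
		schedule();		/* obj_put() will wake us */
	}
	__set_current_state(TASK_RUNNING);
}

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->refs) && o->waiter)
		wake_up_process(o->waiter);
}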
*/ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + return 0; + } } unsigned long module_refcount(struct module *mod) @@ -801,6 +813,21 @@ EXPORT_SYMBOL(module_refcount); /* This exists whether we can unload or not */ static void free_module(struct module *mod); +static void wait_for_zero_refcount(struct module *mod) +{ + /* Since we might sleep for some time, release the mutex first */ + mutex_unlock(&module_mutex); + for (;;) { + pr_debug("Looking at refcount...\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + if (module_refcount(mod) == 0) + break; + schedule(); + } + current->state = TASK_RUNNING; + mutex_lock(&module_mutex); +} + SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -815,11 +842,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) { - printk(KERN_WARNING - "waiting module removal not supported: please upgrade"); - } - if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -837,7 +859,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Doing init or already dying? */ if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count damn the torpedoes */ + /* FIXME: if (force), slam module count and wake up + waiter --RR */ pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; @@ -853,11 +876,18 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } + /* Set this up before setting mod->state */ + mod->waiter = current; + /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; + /* Never wait if forced. */ + if (!forced && module_refcount(mod) != 0) + wait_for_zero_refcount(mod); + mutex_unlock(&module_mutex); /* Final destruction now no one is using it. */ if (mod->exit != NULL) @@ -975,6 +1005,9 @@ void module_put(struct module *module) __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_); + /* Maybe they're waiting for us to drop reference? 
*/ + if (unlikely(!module_is_live(module))) + wake_up_process(module->waiter); preempt_enable(); } } @@ -1112,7 +1145,8 @@ static int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) - pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); + printk(KERN_WARNING "%s: %s: kernel tainted.\n", + mod->name, reason); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else @@ -1165,7 +1199,8 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - pr_warn("%s: no symbol version for %s\n", mod->name, symname); + printk(KERN_WARNING "%s: no symbol version for %s\n", + mod->name, symname); return 0; bad_version: @@ -1274,8 +1309,8 @@ resolve_symbol_wait(struct module *mod, !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { - pr_warn("%s: gave up waiting for init of module %s.\n", - mod->name, owner); + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, owner); } return ksym; } @@ -1591,14 +1626,15 @@ static int mod_sysfs_init(struct module *mod) struct kobject *kobj; if (!module_sysfs_initialized) { - pr_err("%s: module sysfs not initialized\n", mod->name); + printk(KERN_ERR "%s: module sysfs not initialized\n", + mod->name); err = -EINVAL; goto out; } kobj = kset_find_obj(module_kset, mod->name); if (kobj) { - pr_err("%s: module is already loaded\n", mod->name); + printk(KERN_ERR "%s: module is already loaded\n", mod->name); kobject_put(kobj); err = -EINVAL; goto out; @@ -1925,7 +1961,8 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(s->name, &owner, NULL, true, false)) { - pr_err("%s: exports duplicate symbol %s" + printk(KERN_ERR + "%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, s->name, module_name(owner)); return -ENOEXEC; @@ -1976,8 +2013,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - pr_warn("%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; @@ -2131,8 +2168,8 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!test_taint(TAINT_PROPRIETARY_MODULE)) - pr_warn("%s: module license '%s' taints kernel.\n", - mod->name, license); + printk(KERN_WARNING "%s: module license '%s' taints " + "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); } @@ -2368,8 +2405,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); + printk(KERN_ERR "dynamic debug error adding module: %s\n", + debug->modname); #endif } @@ -2582,7 +2619,8 @@ static int rewrite_section_headers(struct load_info *info, int flags) Elf_Shdr *shdr = &info->sechdrs[i]; if (shdr->sh_type != SHT_NOBITS && info->len < shdr->sh_offset + shdr->sh_size) { - pr_err("Module len %lu truncated\n", info->len); + printk(KERN_ERR "Module len %lu truncated\n", + info->len); return -ENOEXEC; } @@ -2644,14 +2682,15 @@ static struct module *setup_load_info(struct load_info *info, int 
flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + printk(KERN_WARNING "No module found in object\n"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", + mod->name); return ERR_PTR(-ENOEXEC); } @@ -2678,7 +2717,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (err) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { - pr_err("%s: version magic '%s' should be '%s'\n", + printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); return -ENOEXEC; } @@ -2688,8 +2727,9 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); - pr_warn("%s: module is from the staging directory, the quality " - "is unknown, you have been warned.\n", mod->name); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); } /* Set up license info based on the info section */ @@ -2698,7 +2738,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return 0; } -static int find_module_sections(struct module *mod, struct load_info *info) +static void find_module_sections(struct module *mod, struct load_info *info) { mod->kp = section_objs(info, "__param", sizeof(*mod->kp), &mod->num_kp); @@ -2728,18 +2768,6 @@ static int find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); - if (!mod->ctors) - mod->ctors = section_objs(info, ".init_array", - sizeof(*mod->ctors), &mod->num_ctors); - else if (find_sec(info, ".init_array")) { - /* - * This shouldn't happen with same compiler and binutils - * building all parts of the module. 
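The hunk above drops the .init_array fallback, leaving .ctors as the only source of mod->ctors; both sections hold pointers to functions that the loader runs before the module's own init, typically emitted by gcc for constructor attributes (in-kernel, this matters mainly for gcov instrumentation). A tiny hypothetical example of code that lands in such a section:

static int ctor_ran;

/* gcc places a pointer to mod_ctor() in .ctors (or .init_array on
 * newer toolchains); the mod->ctors walk runs it before module init. */
static void __attribute__((constructor)) mod_ctor(void)
{
	ctor_ran = 1;
}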
- */ - printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", - mod->name); - return -EINVAL; - } #endif #ifdef CONFIG_TRACEPOINTS @@ -2773,12 +2801,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->extable), &mod->num_exentries); if (section_addr(info, "__obsparm")) - pr_warn("%s: Ignoring obsolete parameters\n", mod->name); + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); - - return 0; } static int move_module(struct module *mod, struct load_info *info) @@ -3051,10 +3078,11 @@ static int do_init_module(struct module *mod) return ret; } if (ret > 0) { - pr_warn("%s: '%s'->init suspiciously returned %d, it should " - "follow 0/-E convention\n" - "%s: loading module anyway...\n", - __func__, mod->name, ret, __func__); + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); dump_stack(); } @@ -3177,8 +3205,10 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname) { /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); - if (ret != 0) - pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); + if (ret != 0) { + printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", + modname, param); + } return 0; } @@ -3213,9 +3243,10 @@ static int load_module(struct load_info *info, const char __user *uargs, #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) { - pr_notice_once("%s: module verification failed: signature " - "and/or required key missing - tainting " - "kernel\n", mod->name); + printk_once(KERN_NOTICE + "%s: module verification failed: signature and/or" + " required key missing - tainting kernel\n", + mod->name); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); } #endif @@ -3232,9 +3263,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now we've got everything in the final locations, we can * find optional sections. */ - err = find_module_sections(mod, info); - if (err) - goto free_unload; + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fa..f2970bd 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,7 +14,6 @@ #include <crypto/public_key.h> #include <crypto/hash.h> #include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> #include "module-internal.h" /* @@ -29,7 +28,7 @@ */ struct module_signature { u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ + u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ u8 id_type; /* Key identifier type [enum pkey_id_type] */ u8 signer_len; /* Length of signer's name */ u8 key_id_len; /* Length of key identifier */ @@ -40,7 +39,7 @@ struct module_signature { /* * Digest the module contents. */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, const void *mod, unsigned long modlen) { @@ -55,7 +54,7 @@ static struct public_key_signature *mod_make_digest(enum hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. 
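mod_make_digest() above drives the synchronous hash API: allocate the tfm, size and allocate the per-request descriptor, then init/update/final. A condensed sketch of that flow, assuming the kernel's <crypto/hash.h> interface; digest_buf() is a hypothetical helper and error handling is abbreviated.

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int digest_buf(const char *algo, const u8 *buf, unsigned int len,
		      u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int ret;

	tfm = crypto_alloc_shash(algo, 0, 0);	/* e.g. "sha256" */
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* The descriptor is sized at runtime for this particular tfm. */
	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	ret = crypto_shash_init(desc);
	if (!ret)
		ret = crypto_shash_update(desc, buf, len);
	if (!ret)
		ret = crypto_shash_final(desc, out);	/* digest into out[] */

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}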
*/ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -158,7 +157,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(system_trusted_keyring, 1), + key = keyring_search(make_key_ref(modsign_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", @@ -218,7 +217,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + !pkey_hash_algo[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, diff --git a/kernel/locking/mutex-debug.c b/kernel/mutex-debug.c index 7e3443f..7e3443f 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/mutex-debug.c diff --git a/kernel/locking/mutex-debug.h b/kernel/mutex-debug.h index 0799fd3..0799fd3 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/mutex-debug.h diff --git a/kernel/locking/mutex.c b/kernel/mutex.c index 4dd6e4c..d24105b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/mutex.c @@ -1,5 +1,5 @@ /* - * kernel/locking/mutex.c + * kernel/mutex.c * * Mutexes: blocking mutual exclusion locks * diff --git a/kernel/locking/mutex.h b/kernel/mutex.h index 4115fbf..4115fbf 100644 --- a/kernel/locking/mutex.h +++ b/kernel/mutex.h diff --git a/kernel/padata.c b/kernel/padata.c index 2abd25d..07af2c9 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,7 +46,6 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) static int padata_cpu_hash(struct parallel_data *pd) { - unsigned int seq_nr; int cpu_index; /* @@ -54,8 +53,10 @@ static int padata_cpu_hash(struct parallel_data *pd) * seq_nr mod. number of cpus in use. 
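[Editor's aside] mod_make_digest() above drives the kernel's synchronous-hash (shash) interface; condensed, the flow is allocate, init, update, final. A sketch under the same API, with error handling elided and the algorithm name purely illustrative (the real code picks it from the signature's hash field):

struct crypto_shash *tfm;
struct shash_desc *desc;
size_t desc_size;
u8 *digest;

tfm = crypto_alloc_shash("sha256", 0, 0);
desc_size = sizeof(*desc) + crypto_shash_descsize(tfm);
/* one allocation holds the descriptor followed by the digest */
desc = kzalloc(desc_size + crypto_shash_digestsize(tfm), GFP_KERNEL);
desc->tfm = tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;

crypto_shash_init(desc);
crypto_shash_update(desc, mod, modlen);		/* digest the payload */
digest = (u8 *)desc + desc_size;
crypto_shash_final(desc, digest);

crypto_free_shash(tfm);
/* use 'digest', then kfree(desc) frees the digest with it */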
*/ - seq_nr = atomic_inc_return(&pd->seq_nr); - cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); + spin_lock(&pd->seq_lock); + cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); + pd->seq_nr++; + spin_unlock(&pd->seq_lock); return padata_index_to_cpu(pd, cpu_index); } @@ -428,7 +429,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = 0; atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; diff --git a/kernel/panic.c b/kernel/panic.c index c00b4ce..b6c482c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -233,7 +233,7 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; if (tainted_mask) { char *s; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06c62de..4208655 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -132,12 +132,6 @@ out: return ERR_PTR(err); } -static void delayed_free_pidns(struct rcu_head *p) -{ - kmem_cache_free(pid_ns_cachep, - container_of(p, struct pid_namespace, rcu)); -} - static void destroy_pid_namespace(struct pid_namespace *ns) { int i; @@ -146,7 +140,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); - call_rcu(&ns->rcu, delayed_free_pidns); + kmem_cache_free(pid_ns_cachep, ns); } struct pid_namespace *copy_pid_ns(unsigned long flags, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc..d444c4e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -178,22 +178,6 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP -config DPM_WATCHDOG - bool "Device suspend/resume watchdog" - depends on PM_DEBUG && PSTORE - ---help--- - Sets up a watchdog timer to capture drivers that are - locked up attempting to suspend/resume a device. - A detected lockup causes system panic with message - captured in pstore device for inspection in subsequent - boot session. - -config DPM_WATCHDOG_TIMEOUT - int "Watchdog timeout in seconds" - range 1 120 - default 12 - depends on DPM_WATCHDOG - config PM_TRACE bool help diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b4..a394297 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -558,12 +558,30 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else { + } else if (count <= 11) { /* ASCII perhaps? 
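[Editor's aside] The delayed_free_pidns() helper removed in the pid_namespace hunk above is the standard pattern for freeing an RCU-protected object: embed a struct rcu_head in the object and let the callback recover its container. A self-contained sketch of the pattern with a hypothetical struct foo:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* callback handle embedded in the object */
};

static void foo_free_rcu(struct rcu_head *head)
{
	/* Map the rcu_head back to the enclosing object and free it. */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release(struct foo *f)
{
	/* Defer the free until every pre-existing RCU reader is done. */
	call_rcu(&f->rcu, foo_free_rcu);
}

Reverting to a direct kmem_cache_free(), as this hunk does, is only safe if no reader can still hold an RCU-protected reference to the namespace at that point.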
*/ + char ascii_value[11]; + unsigned long int ulval; int ret; - ret = kstrtos32_from_user(buf, count, 16, &value); - if (ret) - return ret; + if (copy_from_user(ascii_value, buf, count)) + return -EFAULT; + + if (count > 10) { + if (ascii_value[10] == '\n') + ascii_value[10] = '\0'; + else + return -EINVAL; + } else { + ascii_value[count] = '\0'; + } + ret = kstrtoul(ascii_value, 16, &ulval); + if (ret) { + pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); + return -EINVAL; + } + value = (s32)lower_32_bits(ulval); + } else { + return -EINVAL; } req = filp->private_data; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e..98c3b34 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -792,8 +792,7 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - if (WARN_ON(!(forbidden_pages_map && free_pages_map))) - return; + BUG_ON(!(forbidden_pages_map && free_pages_map)); bm1 = forbidden_pages_map; bm2 = free_pages_map; @@ -1403,11 +1402,7 @@ int hibernate_preallocate_memory(void) * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = count - max_size; - if (alloc > pages_highmem) - alloc -= pages_highmem; - else - alloc = 0; + alloc = (count - max_size) - pages_highmem; pages = preallocate_image_memory(alloc, avail_normal); if (pages < alloc) { /* We have exhausted non-highmem pages, try highmem. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d3575..957f061 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -36,9 +36,9 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - bool frozen; - bool ready; - bool platform_support; + char frozen; + char ready; + char platform_support; bool free_bitmaps; } snapshot_state; @@ -70,7 +70,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? 
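[Editor's aside] The open-coded copy/NUL-terminate/kstrtoul() sequence restored in the pm_qos hunk above is exactly what the removed one-liner wraps up. A sketch of the helper-based form from the '-' side, which copies from userspace, terminates, and parses in a single call:

s32 value;
int ret;

/* Parse up to 'count' bytes of user memory as a base-16 s32;
 * returns 0 on success or a negative errno. */
ret = kstrtos32_from_user(buf, count, 16, &value);
if (ret)
	return ret;

The helper bounds the copy internally, so the manual 11-byte buffer and trailing-newline handling on the '+' side become unnecessary wherever it is available.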
swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; - data->free_bitmaps = false; error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); @@ -94,9 +93,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) atomic_inc(&snapshot_device_available); - data->frozen = false; - data->ready = false; - data->platform_support = false; + data->frozen = 0; + data->ready = 0; + data->platform_support = 0; Unlock: unlock_system_sleep(); @@ -230,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (error) thaw_processes(); else - data->frozen = true; + data->frozen = 1; break; @@ -241,7 +240,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); - data->frozen = false; + data->frozen = 0; break; case SNAPSHOT_CREATE_IMAGE: @@ -271,7 +270,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = false; + data->ready = 0; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -335,7 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = false; + data->ready = 0; break; case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86b..b4e8500 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = { #ifdef CONFIG_KEXEC /* - * This appends the listed symbols to /proc/vmcore + * This appends the listed symbols to /proc/vmcoreinfo * - * /proc/vmcore is used by various utilities, like crash and makedumpfile to + * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. 
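[Editor's aside] The symbols that comment refers to are exported with the VMCOREINFO_* helpers in the CONFIG_KEXEC block that follows it. A sketch of the typical contents; the exact field list varies by kernel version, so treat these names as illustrative rather than a verbatim copy:

void log_buf_kexec_setup(void)
{
	/* Addresses and sizes crash/makedumpfile need to find dmesg. */
	VMCOREINFO_SYMBOL(log_buf);
	VMCOREINFO_SYMBOL(log_buf_len);
	VMCOREINFO_SYMBOL(log_first_idx);
	VMCOREINFO_SYMBOL(log_next_idx);
	/* Layout of an individual log record. */
	VMCOREINFO_STRUCT_SIZE(printk_log);
	VMCOREINFO_OFFSET(printk_log, ts_nsec);
	VMCOREINFO_OFFSET(printk_log, len);
	VMCOREINFO_OFFSET(printk_log, text_len);
	VMCOREINFO_OFFSET(printk_log, dict_len);
}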
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = 1; - pr_info("debug: ignoring loglevel setting.\n"); + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); return 0; } @@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str) pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); - return 0; + return 1; } -early_param("boot_delay", boot_delay_setup); +__setup("boot_delay=", boot_delay_setup); static void boot_delay_msec(int level) { @@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; - pr_info("debug: skip boot console de-registration.\n"); + printk(KERN_INFO "debug: skip boot console de-registration.\n"); return 0; } @@ -2241,7 +2241,7 @@ void register_console(struct console *newcon) /* find the last or real console */ for_each_console(bcon) { if (!(bcon->flags & CON_BOOT)) { - pr_info("Too late to register bootconsole %s%d\n", + printk(KERN_INFO "Too late to register bootconsole %s%d\n", newcon->name, newcon->index); return; } @@ -2358,18 +2358,21 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - pr_info("%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { - /* We need to iterate through all boot consoles, to make - * sure we print everything out, before we unregister them. + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); for_each_console(bcon) if (bcon->flags & CON_BOOT) unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); } } EXPORT_SYMBOL(register_console); @@ -2379,10 +2382,6 @@ int unregister_console(struct console *console) struct console *a, *b; int res; - pr_info("%sconsole [%s%d] disabled\n", - (console->flags & CON_BOOT) ? 
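[Editor's aside] The boot_delay hunk above is more than a rename: early_param() handlers run during early parameter parsing and return 0 on success, while __setup() handlers run later (via obsolete_checksetup()) and return 1 to mark the argument as consumed, with 0 letting it fall through as unknown. A compact sketch of both registrations for a hypothetical 'myopt' option:

static int __init myopt_early(char *str)
{
	/* early_param convention: 0 = parsed OK, nonzero = error. */
	return 0;
}
early_param("myopt", myopt_early);

static int __init myopt_setup(char *str)
{
	/* __setup convention: 1 = handled, 0 = not ours. */
	return 1;
}
__setup("myopt=", myopt_setup);

That is why the hunk flips the return value from 0 to 1 alongside swapping the registration macro.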
"boot" : "" , - console->name, console->index); - res = _braille_unregister_console(console); if (res) return res; @@ -2422,6 +2421,8 @@ static int __init printk_late_init(void) for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + con->name, con->index); unregister_console(con); } } @@ -2448,7 +2449,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) if (pending & PRINTK_PENDING_SCHED) { char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); } if (pending & PRINTK_PENDING_WAKEUP) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f4bcb3..dd562e9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -257,8 +257,7 @@ ok: if (task->mm) dumpable = get_dumpable(task->mm); rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { rcu_read_unlock(); return -EPERM; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu.h index 7859a0a..7713196 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu.h @@ -122,11 +122,4 @@ int rcu_jiffies_till_stall_check(void); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x) tracepoint_string(x) - #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile deleted file mode 100644 index 01e9ec3..0000000 --- a/kernel/rcu/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o -obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o -obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/update.c b/kernel/rcupdate.c index 6cb3dff..b02a339 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcupdate.c @@ -53,12 +53,6 @@ #include "rcu.h" -MODULE_ALIAS("rcupdate"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcupdate." - module_param(rcu_expedited, int, 0); #ifdef CONFIG_PREEMPT_RCU @@ -154,7 +148,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (!rcu_is_watching()) + if (rcu_is_cpu_idle()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; @@ -304,7 +298,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); diff --git a/kernel/rcu/tiny.c b/kernel/rcutiny.c index 1254f31..9ed6075 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcutiny.c @@ -35,7 +35,6 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> -#include <linux/ftrace_event.h> #ifdef CONFIG_RCU_TRACE #include <trace/events/rcu.h> @@ -43,7 +42,7 @@ #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ +/* Forward declarations for rcutiny_plugin.h. 
*/ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); @@ -53,23 +52,22 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; -#include "tiny_plugin.h" +#include "rcutiny_plugin.h" /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), + RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting, newval)); rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); + struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -122,15 +120,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), + RCU_TRACE(trace_rcu_dyntick("++=", oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); if (!is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); + struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -176,18 +174,18 @@ void rcu_irq_enter(void) } EXPORT_SYMBOL_GPL(rcu_irq_enter); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. */ -bool notrace __rcu_is_watching(void) +int rcu_is_cpu_idle(void) { - return rcu_dynticks_nesting; + return !rcu_dynticks_nesting; } -EXPORT_SYMBOL(__rcu_is_watching); +EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. 
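[Editor's aside] Stepping back to the kernel/ptrace.c hunk a few files above: that change is security-relevant, because get_dumpable() is tri-state rather than boolean. A sketch of the distinction, with the constants as defined in linux/sched.h:

enum {
	SUID_DUMP_DISABLE = 0,	/* no setuid dumping */
	SUID_DUMP_USER    = 1,	/* dump as user of process */
	SUID_DUMP_ROOT    = 2,	/* dump as root */
};

/* Stricter test (the '-' lines): anything other than SUID_DUMP_USER
 * needs CAP_SYS_PTRACE in the task's user namespace, so a
 * SUID_DUMP_ROOT task is not attachable by an unprivileged tracer. */
bool strict_denied = (dumpable != SUID_DUMP_USER);

/* Looser test restored by the revert (the '+' lines): only
 * SUID_DUMP_DISABLE is refused. */
bool loose_denied = !dumpable;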
Nested @@ -275,7 +273,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - !!ACCESS_ONCE(rcp->rcucblist), + ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), false)); @@ -306,8 +304,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), false)); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcutiny_plugin.h index 280d06c..280d06c 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcutiny_plugin.h diff --git a/kernel/rcu/torture.c b/kernel/rcutorture.c index 3929cd4..be63101 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcutorture.c @@ -52,12 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." - static int fqs_duration; module_param(fqs_duration, int, 0444); MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); diff --git a/kernel/rcu/tree.c b/kernel/rcutree.c index dd08198..32618b3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcutree.c @@ -41,7 +41,6 @@ #include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> -#include <linux/module.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -57,16 +56,17 @@ #include <linux/ftrace_event.h> #include <linux/suspend.h> -#include "tree.h" +#include "rcutree.h" #include <trace/events/rcu.h> #include "rcu.h" -MODULE_ALIAS("rcutree"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutree." +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) /* Data structures. 
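[Editor's aside] The TPS() macro reintroduced at the end of the rcutree.c hunk above wraps tracepoint_string(), which stashes the literal's address in a dedicated section so post-processing tools can translate the pointer recorded in the ring buffer back into text. In spirit the machinery looks like this; the real definition lives in the ftrace headers, so treat this as a sketch:

/* Place the string's address in the __tracepoint_str section so
 * tracing tools can resolve pointer -> text after the fact. */
#define __tracepoint_string __attribute__((section("__tracepoint_str")))
#define tracepoint_string(str)						\
	({								\
		static const char *___tp_str __tracepoint_string = str;\
		___tp_str;						\
	})

/* Usage, as in the rcutree hunks below: */
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);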
*/ @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE @@ -371,8 +371,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, { trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); + struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); @@ -408,7 +407,7 @@ static void rcu_eqs_enter(bool user) long long oldval; struct rcu_dynticks *rdtp; - rdtp = this_cpu_ptr(&rcu_dynticks); + rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -436,7 +435,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -479,7 +478,7 @@ void rcu_irq_exit(void) struct rcu_dynticks *rdtp; local_irq_save(flags); - rdtp = this_cpu_ptr(&rcu_dynticks); + rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); @@ -509,8 +508,7 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); + struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); @@ -530,7 +528,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - rdtp = this_cpu_ptr(&rcu_dynticks); + rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); if (oldval & DYNTICK_TASK_NEST_MASK) @@ -557,7 +555,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -601,7 +599,7 @@ void rcu_irq_enter(void) long long oldval; local_irq_save(flags); - rdtp = this_cpu_ptr(&rcu_dynticks); + rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); @@ -622,7 +620,7 @@ void rcu_irq_enter(void) */ void rcu_nmi_enter(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 && (atomic_read(&rdtp->dynticks) & 0x1)) @@ -644,7 +642,7 @@ void rcu_nmi_enter(void) */ void rcu_nmi_exit(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 || --rdtp->dynticks_nmi_nesting != 0) @@ -657,34 +655,21 @@ void rcu_nmi_exit(void) } /** - * __rcu_is_watching - are RCU read-side critical sections safe? 
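[Editor's aside] The rcu_is_watching()/rcu_is_cpu_idle() churn throughout these hunks is a straight polarity flip. The tiny-RCU hunk earlier shows it most directly, since both bodies are one-liners:

/* Newer API (being reverted away): true when RCU *is* paying attention. */
bool notrace __rcu_is_watching(void)
{
	return rcu_dynticks_nesting;
}

/* Older API (restored): true when the CPU is idle from RCU's viewpoint. */
int rcu_is_cpu_idle(void)
{
	return !rcu_dynticks_nesting;
}

So every call site translates as rcu_is_watching() == !rcu_is_cpu_idle(), which is why '!rcu_is_watching()' becomes 'rcu_is_cpu_idle()' in __call_rcu_core() further down.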
- * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; -} - -/** - * rcu_is_watching - see if RCU thinks that the current CPU is idle + * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * * If the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ -bool notrace rcu_is_watching(void) +int rcu_is_cpu_idle(void) { int ret; preempt_disable(); - ret = __rcu_is_watching(); + ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(rcu_is_watching); +EXPORT_SYMBOL(rcu_is_cpu_idle); #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) @@ -718,7 +703,7 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi()) return 1; preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); + rdp = &__get_cpu_var(rcu_sched_data); rnp = rdp->mynode; ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; @@ -738,7 +723,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; + return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } /* @@ -817,11 +802,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - unsigned long j = ACCESS_ONCE(jiffies); - - rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ - rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); + rsp->gp_start = jiffies; + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } /* @@ -916,12 +898,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -951,60 +927,22 @@ static void print_cpu_stall(struct rcu_state *rsp) 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. - */ - resched_cpu(smp_processor_id()); + set_need_resched(); /* kick ourselves to get things going. */ } static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long completed; - unsigned long gpnum; - unsigned long gps; unsigned long j; unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if (rcu_cpu_stall_suppress) return; j = ACCESS_ONCE(jiffies); - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. 
- * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. - * - * Given this check, comparisons of jiffies, rsp->jiffies_stall, - * and rsp->gp_start suffice to forestall false positives. - */ - gpnum = ACCESS_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ js = ACCESS_ONCE(rsp->jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = ACCESS_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = ACCESS_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1359,7 +1297,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. */ static int rcu_gp_init(struct rcu_state *rsp) { @@ -1368,27 +1306,18 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); - if (rsp->gp_flags == 0) { - /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); - return 0; - } rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { - /* - * Grace period already in progress, don't start another. - * Not supposed to be able to happen. - */ + if (rcu_gp_in_progress(rsp)) { + /* Grace period already in progress, don't start another. */ raw_spin_unlock_irq(&rnp->lock); return 0; } /* Advance to a new grace period and initialize state. */ - record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ @@ -1437,7 +1366,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; bool isidle = false; @@ -1522,12 +1451,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) { - rsp->gp_flags = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("newreq")); - } + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); } @@ -1537,7 +1462,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) static int __noreturn rcu_gp_kthread(void *arg) { int fqs_state; - int gf; unsigned long j; int ret; struct rcu_state *rsp = arg; @@ -1547,19 +1471,14 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. 
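[Editor's aside] The stall checks in this area compare jiffies-derived values with ULONG_CMP_GE()/ULONG_CMP_LT() instead of plain relational operators so that the test stays correct when a counter wraps. A sketch of the idiom, assuming the definitions from rcupdate.h:

/* a >= b, modulo wrap: true iff the unsigned distance a - b falls in
 * the lower half of the unsigned long space. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

/* Example: has the stall deadline js passed, even across a wrap? */
if (ULONG_CMP_GE(jiffies, js))
	printk(KERN_ERR "stall deadline reached\n");

A plain 'jiffies >= js' would misfire for roughly half of each wrap period; the subtraction form only misjudges distances larger than ULONG_MAX/2.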
*/ for (;;) { - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("reqwait")); wait_event_interruptible(rsp->gp_wq, - ACCESS_ONCE(rsp->gp_flags) & + rsp->gp_flags & RCU_GP_FLAG_INIT); - if (rcu_gp_init(rsp)) + if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && + rcu_gp_init(rsp)) break; cond_resched(); flush_signals(current); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ @@ -1569,16 +1488,10 @@ static int __noreturn rcu_gp_kthread(void *arg) j = HZ; jiffies_till_first_fqs = HZ; } - ret = 0; for (;;) { - if (!ret) - rsp->jiffies_force_qs = jiffies + j; - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqswait")); + rsp->jiffies_force_qs = jiffies + j; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = ACCESS_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || + (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); @@ -1587,23 +1500,13 @@ static int __noreturn rcu_gp_kthread(void *arg) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || - (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqsstart")); + if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { fqs_state = rcu_gp_fqs(rsp, fqs_state); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqsend")); cond_resched(); } else { /* Deal with stray signal. */ cond_resched(); flush_signals(current); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqswaitsig")); } j = jiffies_till_next_fqs; if (j > HZ) { @@ -1651,8 +1554,6 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, return; } rsp->gp_flags = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - TPS("newreq")); /* * We can't do wakeups while holding the rnp->lock, as that @@ -2354,7 +2255,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ @@ -2824,13 +2725,10 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (!rdp->nxtlist) - continue; - hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + if (rdp->qlen != rdp->qlen_lazy) al = false; - break; - } + if (rdp->nxtlist) + hc = true; } if (all_lazy) *all_lazy = al; @@ -3318,7 +3216,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* * Compute the rcu_node tree geometry from kernel parameters. This cannot - * replace the definitions in tree.h because those are needed to size + * replace the definitions in rcutree.h because those are needed to size * the ->node array in the rcu_state structure. 
*/ static void __init rcu_init_geometry(void) @@ -3397,8 +3295,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); @@ -3413,4 +3311,4 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } -#include "tree_plugin.h" +#include "rcutree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcutree.h index 52be957..5f97eab 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcutree.h @@ -104,8 +104,6 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; - /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcutree_plugin.h index 08a7652..130c97b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> -#include "../time/tick-internal.h" +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -96,15 +96,10 @@ static void __init rcu_bootup_announce_oddness(void) #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); + cpumask_setall(rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -665,7 +660,7 @@ static void rcu_preempt_check_callbacks(int cpu) static void rcu_preempt_do_callbacks(void) { - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); + rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1133,7 +1128,7 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../locking/rtmutex_common.h" +#include "rtmutex_common.h" #ifdef CONFIG_RCU_TRACE @@ -1337,7 +1332,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __this_cpu_read(rcu_cpu_kthread_task) == current; + return __get_cpu_var(rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1387,8 +1382,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_do_callbacks(); } @@ -1407,7 +1402,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __this_cpu_read(rcu_cpu_has_work); + return __get_cpu_var(rcu_cpu_has_work); } /* @@ -1417,8 +1412,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ 
static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); + char work, *workp = &__get_cpu_var(rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { @@ -1632,26 +1627,20 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; +extern int tick_nohz_enabled; /* - * Try to advance callbacks for all flavors of RCU on the current CPU, but - * only if it has been awhile since the last time we did so. Afterwards, - * if there are any callbacks ready for immediate invocation, return true. + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ static bool rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; - /* Exit early if we advanced recently. */ - if (jiffies == rdtp->last_advance_all) - return 0; - rdtp->last_advance_all = jiffies; - for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); rnp = rdp->mynode; @@ -1729,7 +1718,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_active); + tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ @@ -1750,8 +1739,6 @@ static void rcu_prepare_for_idle(int cpu) */ if (rdtp->all_lazy && rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { - rdtp->all_lazy = false; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1781,11 +1768,17 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { + struct rcu_data *rdp; + struct rcu_state *rsp; if (rcu_is_nocb_cpu(cpu)) return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); + } } /* @@ -2115,22 +2108,15 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll || !t) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeNotPoll")); + if (rcu_nocb_poll | !t) return; - } len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ rdp->qlen_last_fqs_check = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. 
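[Editor's aside] The this_cpu_ptr(&var) vs &__get_cpu_var(var) churn running through these RCU-kthread hunks is mechanical: both expressions yield a pointer to the executing CPU's instance of a per-CPU variable, and both require preemption to already be disabled to be meaningful. A minimal sketch with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, my_counter);

static void bump_my_counter(void)
{
	unsigned int *p;

	preempt_disable();
	p = this_cpu_ptr(&my_counter);		/* newer spelling */
	/* p = &__get_cpu_var(my_counter);	   older spelling, same pointer */
	(*p)++;
	preempt_enable();
}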
*/ rdp->qlen_last_fqs_check = LONG_MAX / 2; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); - } else { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } return; } @@ -2154,12 +2140,10 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, (unsigned long)rhp->func, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + rdp->qlen_lazy, rdp->qlen); else trace_rcu_callback(rdp->rsp->name, rhp, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + rdp->qlen_lazy, rdp->qlen); return 1; } @@ -2237,7 +2221,6 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; - bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2246,27 +2229,14 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); + if (!rcu_nocb_poll) wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } list = ACCESS_ONCE(rdp->nocb_head); if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); schedule_timeout_interruptible(1); flush_signals(current); continue; } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); /* * Extract queued callbacks, update counts, and wait @@ -2287,11 +2257,7 @@ static int rcu_nocb_kthread(void *arg) next = list->next; /* Wait for enqueuing to complete, if needed. 
*/ while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WaitQueue")); schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeQueue")); next = list->next; } debug_rcu_head_unqueue(list); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcutree_trace.c index 3596797..cf6c174 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcutree_trace.c @@ -44,7 +44,7 @@ #include <linux/seq_file.h> #define RCU_TREE_NONCORE -#include "tree.h" +#include "rcutree.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) diff --git a/kernel/locking/rtmutex-debug.c b/kernel/rtmutex-debug.c index 13b243a..13b243a 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c diff --git a/kernel/locking/rtmutex-debug.h b/kernel/rtmutex-debug.h index 14193d5..14193d5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/rtmutex-debug.h diff --git a/kernel/locking/rtmutex-tester.c b/kernel/rtmutex-tester.c index 1d96dd0..1d96dd0 100644 --- a/kernel/locking/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c diff --git a/kernel/locking/rtmutex.c b/kernel/rtmutex.c index 0dd6aec..0dd6aec 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/rtmutex.c diff --git a/kernel/locking/rtmutex.h b/kernel/rtmutex.h index a1a1dd0..a1a1dd0 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/rtmutex.h diff --git a/kernel/locking/rtmutex_common.h b/kernel/rtmutex_common.h index 53a66c8..53a66c8 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/rtmutex_common.h diff --git a/kernel/locking/rwsem.c b/kernel/rwsem.c index cfff143..cfff143 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/rwsem.c diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b62140..54adcf3 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,7 +12,6 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o -obj-y += wait.o completion.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c deleted file mode 100644 index a63f4dc..0000000 --- a/kernel/sched/completion.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Generic wait-for-completion handler; - * - * It differs from semaphores in that their default case is the opposite, - * wait_for_completion default blocks whereas semaphore default non-block. The - * interface also makes it easy to 'complete' multiple waiting threads, - * something which isn't entirely natural for semaphores. - * - * But more importantly, the primitive documents the usage. Semaphores would - * typically be used for exclusion which gives rise to priority inversion. - * Waiting for completion is a typically sync point, but not an exclusion point. - */ - -#include <linux/sched.h> -#include <linux/completion.h> - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = action(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. 
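[Editor's aside] The kerneldoc in this deleted file describes the API fully; for orientation, a minimal producer/consumer usage sketch of it (the thread name and work placeholder are illustrative):

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(work_done);

static int worker_fn(void *unused)
{
	/* ... perform the work ... */
	complete(&work_done);		/* wakes exactly one waiter, FIFO */
	return 0;
}

static void start_and_wait(void)
{
	kthread_run(worker_fn, NULL, "worker");
	wait_for_completion(&work_done);	/* uninterruptible, no timeout */
}

Note the revert only relocates this code to its pre-split home (the completion primitives lived in kernel/sched/core.c before 3.13); the semantics the comments describe are unchanged.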
It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ - wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. 
- */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Return: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Return: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e85cda2..5ac63c9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,11 +513,12 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ +#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - lockdep_assert_held(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -525,10 +526,8 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); + if (cpu == smp_processor_id()) return; - } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -547,7 +546,6 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -695,6 +693,12 @@ void sched_avg_update(struct rq *rq) } } +#else /* !CONFIG_SMP */ +void resched_task(struct task_struct *p) +{ + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -763,14 +767,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); } @@ -983,7 +987,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. 
*/ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1013,107 +1017,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } -static void __migrate_swap_task(struct task_struct *p, int cpu) -{ - if (p->on_rq) { - struct rq *src_rq, *dst_rq; - - src_rq = task_rq(p); - dst_rq = cpu_rq(cpu); - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); - } else { - /* - * Task isn't running anymore; make it appear like we migrated - * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. - */ - p->wake_cpu = cpu; - } -} - -struct migration_swap_arg { - struct task_struct *src_task, *dst_task; - int src_cpu, dst_cpu; -}; - -static int migrate_swap_stop(void *data) -{ - struct migration_swap_arg *arg = data; - struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; - - src_rq = cpu_rq(arg->src_cpu); - dst_rq = cpu_rq(arg->dst_cpu); - - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); - if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; - - if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; - - if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) - goto unlock; - - if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) - goto unlock; - - __migrate_swap_task(arg->src_task, arg->dst_cpu); - __migrate_swap_task(arg->dst_task, arg->src_cpu); - - ret = 0; - -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); - - return ret; -} - -/* - * Cross migrate two tasks - */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) -{ - struct migration_swap_arg arg; - int ret = -EINVAL; - - arg = (struct migration_swap_arg){ - .src_task = cur, - .src_cpu = task_cpu(cur), - .dst_task = p, - .dst_cpu = task_cpu(p), - }; - - if (arg.src_cpu == arg.dst_cpu) - goto out; - - /* - * These three tests are all lockless; this is OK since all of them - * will be re-checked with proper locks held further down the line. - */ - if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) - goto out; - - if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) - goto out; - - if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) - goto out; - - ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); - -out: - return ret; -} - struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1333,9 +1236,9 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 
*/ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1427,13 +1330,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*rq->max_idle_balance_cost; + u64 max = 2*sysctl_sched_migration_cost; - update_avg(&rq->avg_idle, delta); - - if (rq->avg_idle > max) + if (delta > max) rq->avg_idle = max; - + else + update_avg(&rq->avg_idle, delta); rq->idle_stamp = 0; } #endif @@ -1494,14 +1396,6 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - if (tif_need_resched()) - set_preempt_need_resched(); - if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1619,7 +1513,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1701,7 +1595,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(struct task_struct *p) { p->on_rq = 0; @@ -1725,24 +1619,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; p->mm->numa_scan_seq = 0; } - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; - p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; - - INIT_LIST_HEAD(&p->numa_entry); - p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1768,12 +1654,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(unsigned long clone_flags, struct task_struct *p) +void sched_fork(struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(clone_flags, p); + __sched_fork(p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -1831,7 +1717,10 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif - init_task_preempt_count(p); +#ifdef CONFIG_PREEMPT_COUNT + /* Want to start with kernel preemption disabled. 
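+ * (Illustrative note: the count of 1 stands in for the preemption
+ * disable a freshly switched-in task is logically inside; it is
+ * released when the child first runs through schedule_tail() and
+ * finish_task_switch() drops rq->lock.)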
*/ + task_thread_info(p)->preempt_count = 1; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -1858,7 +1747,7 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif /* Initialize new task's runnable average */ @@ -1949,7 +1838,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(rq, prev, next); + sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); @@ -2001,8 +1890,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2186,7 +2073,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2253,20 +2140,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns = 0; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) - /* - * 64-bit doesn't need locks to atomically read a 64bit value. - * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. - * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is - * indistinguishable from the read occurring a few cycles earlier. - */ - if (!p->on_cpu) - return p->se.sum_exec_runtime; -#endif - rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, p, &flags); @@ -2342,7 +2215,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes preempt_count_add(int val) +void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2351,7 +2224,7 @@ void __kprobes preempt_count_add(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - __preempt_count_add(val); + preempt_count() += val; #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? 
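 *
 * Aside, illustrative only: this hunk and the ones below restore the
 * v3.12 names add_preempt_count()/sub_preempt_count() in place of
 * v3.13's preempt_count_add()/preempt_count_sub(). Callers in the
 * rewound tree follow the pattern this diff restores in
 * __cond_resched():
 *
 *	add_preempt_count(PREEMPT_ACTIVE);
 *	__schedule();
 *	sub_preempt_count(PREEMPT_ACTIVE);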
@@ -2362,9 +2235,9 @@ void __kprobes preempt_count_add(int val) if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -EXPORT_SYMBOL(preempt_count_add); +EXPORT_SYMBOL(add_preempt_count); -void __kprobes preempt_count_sub(int val) +void __kprobes sub_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2382,9 +2255,9 @@ void __kprobes preempt_count_sub(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - __preempt_count_sub(val); + preempt_count() -= val; } -EXPORT_SYMBOL(preempt_count_sub); +EXPORT_SYMBOL(sub_preempt_count); #endif @@ -2557,7 +2430,6 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); - clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2648,9 +2520,9 @@ asmlinkage void __sched notrace preempt_schedule(void) return; do { - __preempt_count_add(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2660,7 +2532,6 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); -#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2670,19 +2541,20 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { + struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(preempt_count() || !irqs_disabled()); + BUG_ON(ti->preempt_count || !irqs_disabled()); prev_state = exception_enter(); do { - __preempt_count_add(PREEMPT_ACTIVE); + add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2694,6 +2566,8 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } +#endif /* CONFIG_PREEMPT */ + int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { @@ -2701,6 +2575,393 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. 
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = WF_SYNC; + + if (unlikely(!q)) + return; + + if (unlikely(nr_exclusive != 1)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
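+ *
+ * Example (illustrative only): release every waiter once one-time
+ * setup has finished:
+ *
+ *	static DECLARE_COMPLETION(setup_done);
+ *	...
+ *	complete_all(&setup_done);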
+ */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. 
The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Return: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. 
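+ *
+ * Example (illustrative only): consume a count opportunistically and
+ * sleep only when none is available:
+ *
+ *	if (!try_wait_for_completion(&x))
+ *		wait_for_completion(&x);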
+ */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Return: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -3337,11 +3598,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; + get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); + put_online_cpus(); return -ESRCH; } @@ -3398,6 +3661,7 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); + put_online_cpus(); return retval; } @@ -3442,6 +3706,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; + get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -3454,11 +3719,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); + put_online_cpus(); return retval; } @@ -3528,11 +3794,16 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { - __preempt_count_add(PREEMPT_ACTIVE); + add_preempt_count(PREEMPT_ACTIVE); __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -3915,7 +4186,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(0, idle); + __sched_fork(idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -3941,7 +4212,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
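 *
 * (Why outside: raw_spin_lock()/raw_spin_unlock() themselves adjust
 * preempt_count, so zeroing it inside the locked region would be
 * decremented below zero by the later unlock.)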
*/ - init_idle_preempt_count(idle, cpu); + task_thread_info(idle)->preempt_count = 0; /* * The idle tasks have their own, simple scheduling class: @@ -4075,53 +4346,6 @@ fail: return ret; } -#ifdef CONFIG_NUMA_BALANCING -/* Migrate current task p to target_cpu */ -int migrate_task_to(struct task_struct *p, int target_cpu) -{ - struct migration_arg arg = { p, target_cpu }; - int curr_cpu = task_cpu(p); - - if (curr_cpu == target_cpu) - return 0; - - if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) - return -EINVAL; - - /* TODO: This is not properly updating schedstats */ - - return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); -} - -/* - * Requeue a task on a given node and accurately track the number of NUMA - * tasks on the runqueues - */ -void sched_setnuma(struct task_struct *p, int nid) -{ - struct rq *rq; - unsigned long flags; - bool on_rq, running; - - rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; - running = task_current(rq, p); - - if (on_rq) - dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); - task_rq_unlock(rq, p, &flags); -} -#endif - /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -4761,7 +4985,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rd yet then + * If we dont want to free the old_rt yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -4895,9 +5119,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { @@ -4909,19 +5130,11 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - sd = sd->parent; /* sd_busy */ } - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; - - sd = lowest_flag_domain(cpu, SD_NUMA); - rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); - - sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); } /* @@ -5441,7 +5654,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA | sd_local_flags(level) , .last_balance = jiffies, @@ -6123,17 +6335,14 @@ void __init sched_init_smp(void) sched_init_numa(); - /* - * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot - * happen. 
- */ + get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); + put_online_cpus(); hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6296,7 +6505,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7069,12 +7277,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - /* - * If we need to toggle cfs_bandwidth_used, off->on must occur - * before making related changes, and on->off must occur afterwards - */ - if (runtime_enabled && !runtime_was_enabled) - cfs_bandwidth_usage_inc(); + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7100,8 +7303,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } - if (runtime_was_enabled && !runtime_enabled) - cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d18..1965599 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,7 +15,6 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> -#include <linux/mempolicy.h> #include "sched.h" @@ -138,9 +137,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif -#ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); -#endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); #endif @@ -163,7 +159,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); @@ -229,14 +225,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif -#ifdef CONFIG_CFS_BANDWIDTH - SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", - cfs_rq->tg->cfs_bandwidth.timer_active); - SEQ_printf(m, " .%-30s: %d\n", "throttled", - cfs_rq->throttled); - SEQ_printf(m, " .%-30s: %d\n", "throttle_count", - cfs_rq->throttle_count); -#endif #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); @@ -357,7 +345,7 @@ static void sched_debug_header(struct seq_file *m) cpu_clk = local_clock(); local_irq_restore(flags); - SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -500,56 +488,6 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); -#define __P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) -#define 
__PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - - -static void sched_show_numa(struct task_struct *p, struct seq_file *m) -{ -#ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - int node, i; - - if (p->mm) - P(mm->numa_scan_seq); - - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - - SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); - - for_each_online_node(node) { - for (i = 0; i < 2; i++) { - unsigned long nr_faults = -1; - int cpu_current, home_node; - - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; - - cpu_current = !i ? (task_node(p) == node) : - (pol && node_isset(node, pol->v.nodes)); - - home_node = (p->numa_preferred_nid == node); - - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", - i, node, cpu_current, home_node, nr_faults); - } - } - - mpol_put(pol); -#endif -} - void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; @@ -653,8 +591,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } - - sched_show_numa(p, m); } void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd773ad..7c70201 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -681,8 +681,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static unsigned long task_h_load(struct task_struct *p); - static inline void __update_task_entity_contrib(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ @@ -820,12 +818,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * Approximate time to scan a full NUMA task in ms. The task scan period is - * calculated based on the tasks virtual memory size and - * numa_balancing_scan_size. + * numa task sample period in ms */ -unsigned int sysctl_numa_balancing_scan_period_min = 1000; -unsigned int sysctl_numa_balancing_scan_period_max = 60000; +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -833,835 +830,41 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. 
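- *
- * (Illustrative reading of the default below: once one shared-page
- * migration has been skipped, the next 16 NUMA hinting faults on
- * shared pages skip migration as well before another attempt is made.)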
- */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - -static unsigned int task_nr_scan_windows(struct task_struct *p) -{ - unsigned long rss = 0; - unsigned long nr_scan_pages; - - /* - * Calculations based on RSS as non-present and empty pages are skipped - * by the PTE scanner and NUMA hinting faults should be trapped based - * on resident pages - */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); - rss = get_mm_rss(p->mm); - if (!rss) - rss = nr_scan_pages; - - rss = round_up(rss, nr_scan_pages); - return rss / nr_scan_pages; -} - -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ -#define MAX_SCAN_WINDOW 2560 - -static unsigned int task_scan_min(struct task_struct *p) -{ - unsigned int scan, floor; - unsigned int windows = 1; - - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; - floor = 1000 / windows; - - scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); - return max_t(unsigned int, floor, scan); -} - -static unsigned int task_scan_max(struct task_struct *p) -{ - unsigned int smin = task_scan_min(p); - unsigned int smax; - - /* Watch for min being lower than max due to floor calculations */ - smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); - return max(smin, smax); -} - -/* - * Once a preferred node is selected the scheduler balancer will prefer moving - * a task to that node for sysctl_numa_balancing_settle_count number of PTE - * scans. This will give the process the chance to accumulate more faults on - * the preferred node but still allow the scheduler to move the task again if - * the nodes CPUs are overloaded. - */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; - -static void account_numa_enqueue(struct rq *rq, struct task_struct *p) -{ - rq->nr_numa_running += (p->numa_preferred_nid != -1); - rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); -} - -static void account_numa_dequeue(struct rq *rq, struct task_struct *p) -{ - rq->nr_numa_running -= (p->numa_preferred_nid != -1); - rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); -} - -struct numa_group { - atomic_t refcount; - - spinlock_t lock; /* nr_tasks, tasks */ - int nr_tasks; - pid_t gid; - struct list_head task_list; - - struct rcu_head rcu; - unsigned long total_faults; - unsigned long faults[0]; -}; - -pid_t task_numa_group_id(struct task_struct *p) -{ - return p->numa_group ? p->numa_group->gid : 0; -} - -static inline int task_faults_idx(int nid, int priv) -{ - return 2 * nid + priv; -} - -static inline unsigned long task_faults(struct task_struct *p, int nid) -{ - if (!p->numa_faults) - return 0; - - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; -} - -static inline unsigned long group_faults(struct task_struct *p, int nid) -{ - if (!p->numa_group) - return 0; - - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; -} - -/* - * These return the fraction of accesses done by a particular task, or - * task group, on a particular numa node. The group weight is given a - * larger multiplier, in order to group tasks together that are almost - * evenly spread out between numa nodes. 
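- *
- * Worked example (illustrative): a task with 600 of its 1000 recorded
- * faults on node N gets task_weight(p, N) == 600; the weights are
- * per-mille fractions, hence the 1000 multiplier below.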
- */ -static inline unsigned long task_weight(struct task_struct *p, int nid) -{ - unsigned long total_faults; - - if (!p->numa_faults) - return 0; - - total_faults = p->total_numa_faults; - - if (!total_faults) - return 0; - - return 1000 * task_faults(p, nid) / total_faults; -} - -static inline unsigned long group_weight(struct task_struct *p, int nid) -{ - if (!p->numa_group || !p->numa_group->total_faults) - return 0; - - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; -} - -static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); - -/* Cached statistics for all CPUs within a node */ -struct numa_stats { - unsigned long nr_running; - unsigned long load; - - /* Total compute capacity of CPUs on a node */ - unsigned long power; - - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; - int has_capacity; -}; - -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) -{ - int cpu, cpus = 0; - - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); - - ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); - - cpus++; - } - - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. - */ - if (!cpus) - return; - - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); -} - -struct task_numa_env { - struct task_struct *p; - - int src_cpu, src_nid; - int dst_cpu, dst_nid; - - struct numa_stats src_stats, dst_stats; - - int imbalance_pct, idx; - - struct task_struct *best_task; - long best_imp; - int best_cpu; -}; - -static void task_numa_assign(struct task_numa_env *env, - struct task_struct *p, long imp) -{ - if (env->best_task) - put_task_struct(env->best_task); - if (p) - get_task_struct(p); - - env->best_task = p; - env->best_imp = imp; - env->best_cpu = env->dst_cpu; -} - -/* - * This checks if the overall compute and NUMA accesses of the system would - * be improved if the source tasks was migrated to the target dst_cpu taking - * into account that it might be best if task running on the dst_cpu should - * be exchanged with the source task - */ -static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) -{ - struct rq *src_rq = cpu_rq(env->src_cpu); - struct rq *dst_rq = cpu_rq(env->dst_cpu); - struct task_struct *cur; - long dst_load, src_load; - long load; - long imp = (groupimp > 0) ? groupimp : taskimp; - - rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ - cur = NULL; - - /* - * "imp" is the fault differential for the source task between the - * source and destination node. Calculate the total differential for - * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to - * be incurred if the tasks were swapped. 
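- *
- * (Illustrative: for a swap candidate in the same numa_group the code
- * below computes imp = taskimp + task_weight(cur, src_nid) -
- * task_weight(cur, dst_nid), i.e. the combined improvement if both
- * tasks move; env-> prefixes omitted here.)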
- */ - if (cur) { - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) - goto unlock; - - /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. - */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - - if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); - else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); - } - } - - if (imp < env->best_imp) - goto unlock; - - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) - goto assign; - - /* - * In the overloaded case, try and keep the load balanced. - */ -balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; - - /* XXX missing power terms */ - load = task_h_load(env->p); - dst_load += load; - src_load -= load; - - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; - } - - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) - goto unlock; - -assign: - task_numa_assign(env, cur, imp); -unlock: - rcu_read_unlock(); -} - -static void task_numa_find_cpu(struct task_numa_env *env, - long taskimp, long groupimp) -{ - int cpu; - - for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) - continue; - - env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); - } -} - -static int task_numa_migrate(struct task_struct *p) -{ - struct task_numa_env env = { - .p = p, - - .src_cpu = task_cpu(p), - .src_nid = task_node(p), - - .imbalance_pct = 112, - - .best_task = NULL, - .best_imp = 0, - .best_cpu = -1 - }; - struct sched_domain *sd; - unsigned long taskweight, groupweight; - int nid, ret; - long taskimp, groupimp; - - /* - * Pick the lowest SD_NUMA domain, as that would have the smallest - * imbalance and would be the first to start moving tasks about. - * - * And we want to avoid any moving of tasks about, as that would create - * random movement of tasks -- counter the numa conditions we're trying - * to satisfy here. - */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) - env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; - rcu_read_unlock(); - - /* - * Cpusets can break the scheduler domain tree into smaller - * balance domains, some of which do not cross NUMA boundaries. - * Tasks that are "trapped" in such domains cannot be migrated - * elsewhere, so there is no point in (re)trying. 
- */ - if (unlikely(!sd)) { - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); - return -EINVAL; - } - - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); - env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); - - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); - - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { - for_each_online_node(nid) { - if (nid == env.src_nid || nid == p->numa_preferred_nid) - continue; - - /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; - if (taskimp < 0 && groupimp < 0) - continue; - - env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); - task_numa_find_cpu(&env, taskimp, groupimp); - } - } - - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - - sched_setnuma(p, env.dst_nid); - - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_min(p); - - if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); - return ret; - } - - ret = migrate_swap(p, env.best_task); - put_task_struct(env.best_task); - return ret; -} - -/* Attempt to migrate a task to a CPU on the preferred node. */ -static void numa_migrate_preferred(struct task_struct *p) -{ - /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) - return; - - /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; - - /* Success if task is already running on preferred CPU */ - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) - return; - - /* Otherwise, try migrate to a CPU on the preferred node */ - task_numa_migrate(p); -} - -/* - * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS - * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease - */ -#define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 - -/* - * Increase the scan period (slow down scanning) if the majority of - * our memory is already on our local node, or if the majority of - * the page accesses are shared with other processes. - * Otherwise, decrease the scan period. - */ -static void update_task_scan_period(struct task_struct *p, - unsigned long shared, unsigned long private) -{ - unsigned int period_slot; - int ratio; - int diff; - - unsigned long remote = p->numa_faults_locality[0]; - unsigned long local = p->numa_faults_locality[1]; - - /* - * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. 
Scan slower - */ - if (local + shared == 0) { - p->numa_scan_period = min(p->numa_scan_period_max, - p->numa_scan_period << 1); - - p->mm->numa_next_scan = jiffies + - msecs_to_jiffies(p->numa_scan_period); - - return; - } - - /* - * Prepare to scale scan period relative to the current period. - * == NUMA_PERIOD_THRESHOLD scan period stays the same - * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) - * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) - */ - period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); - ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); - if (ratio >= NUMA_PERIOD_THRESHOLD) { - int slot = ratio - NUMA_PERIOD_THRESHOLD; - if (!slot) - slot = 1; - diff = slot * period_slot; - } else { - diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; - - /* - * Scale scan rate increases based on sharing. There is an - * inverse relationship between the degree of sharing and - * the adjustment made to the scanning period. Broadly - * speaking the intent is that there is little point - * scanning faster if shared accesses dominate as it may - * simply bounce migrations uselessly - */ - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); - diff = (diff * ratio) / NUMA_PERIOD_SLOTS; - } - - p->numa_scan_period = clamp(p->numa_scan_period + diff, - task_scan_min(p), task_scan_max(p)); - memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); -} - static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; - unsigned long fault_types[2] = { 0, 0 }; - spinlock_t *group_lock = NULL; + int seq; + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; - p->numa_scan_period_max = task_scan_max(p); - - /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; - spin_lock(group_lock); - } - - /* Find the node with the highest number of faults */ - for_each_online_node(nid) { - unsigned long faults = 0, group_faults = 0; - int priv, i; - - for (priv = 0; priv < 2; priv++) { - long diff; - - i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; - - /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; - - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; - p->total_numa_faults += diff; - if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; - } - } - - if (faults > max_faults) { - max_faults = faults; - max_nid = nid; - } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } - } - - update_task_scan_period(p, fault_types[0], fault_types[1]); - - if (p->numa_group) { - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. 
- */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - - spin_unlock(group_lock); - } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); - } -} - -static inline int get_numa_group(struct numa_group *grp) -{ - return atomic_inc_not_zero(&grp->refcount); -} - -static inline void put_numa_group(struct numa_group *grp) -{ - if (atomic_dec_and_test(&grp->refcount)) - kfree_rcu(grp, rcu); -} - -static void task_numa_group(struct task_struct *p, int cpupid, int flags, - int *priv) -{ - struct numa_group *grp, *my_grp; - struct task_struct *tsk; - bool join = false; - int cpu = cpupid_to_cpu(cpupid); - int i; - - if (unlikely(!p->numa_group)) { - unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); - - grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!grp) - return; - - atomic_set(&grp->refcount, 1); - spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); - grp->gid = p->pid; - - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; - - grp->total_faults = p->total_numa_faults; - - list_add(&p->numa_entry, &grp->task_list); - grp->nr_tasks++; - rcu_assign_pointer(p->numa_group, grp); - } - - rcu_read_lock(); - tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); - - if (!cpupid_match_pid(tsk, cpupid)) - goto no_join; - - grp = rcu_dereference(tsk->numa_group); - if (!grp) - goto no_join; - - my_grp = p->numa_group; - if (grp == my_grp) - goto no_join; - - /* - * Only join the other group if its bigger; if we're the bigger group, - * the other task will join us. - */ - if (my_grp->nr_tasks > grp->nr_tasks) - goto no_join; - - /* - * Tie-break on the grp address. - */ - if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) - goto no_join; - - /* Always join threads in the same process. 
*/ - if (tsk->mm == current->mm) - join = true; - - /* Simple filter to avoid false positives due to PID collisions */ - if (flags & TNF_SHARED) - join = true; - - /* Update priv based on whether false sharing was detected */ - *priv = !join; - - if (join && !get_numa_group(grp)) - goto no_join; - - rcu_read_unlock(); - - if (!join) - return; - - double_lock(&my_grp->lock, &grp->lock); - - for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; - } - my_grp->total_faults -= p->total_numa_faults; - grp->total_faults += p->total_numa_faults; - - list_move(&p->numa_entry, &grp->task_list); - my_grp->nr_tasks--; - grp->nr_tasks++; - - spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); - - rcu_assign_pointer(p->numa_group, grp); - - put_numa_group(my_grp); - return; - -no_join: - rcu_read_unlock(); - return; -} - -void task_numa_free(struct task_struct *p) -{ - struct numa_group *grp = p->numa_group; - int i; - void *numa_faults = p->numa_faults; - - if (grp) { - spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; - grp->total_faults -= p->total_numa_faults; - - list_del(&p->numa_entry); - grp->nr_tasks--; - spin_unlock(&grp->lock); - rcu_assign_pointer(p->numa_group, NULL); - put_numa_group(grp); - } - - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; - kfree(numa_faults); + /* FIXME: Scheduling placement policy hints go here */ } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; - bool migrated = flags & TNF_MIGRATED; - int priv; if (!numabalancing_enabled) return; - /* for example, ksmd faulting in a user's mm */ - if (!p->mm) - return; - - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - - /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; - - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) - return; - - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); - p->total_numa_faults = 0; - memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); - } + /* FIXME: Allocate task-specific structure for placement policy here */ /* - * First accesses are treated as private, otherwise consider accesses - * to be private if the accessing pid has not changed + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { - priv = 1; - } else { - priv = cpupid_match_pid(p, last_cpupid); - if (!priv && !(flags & TNF_NO_GROUP)) - task_numa_group(p, last_cpupid, flags, &priv); - } + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); - - /* - * Retry task to preferred node migration periodically, in case it - * case it previously failed, or the scheduler moved us. 
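- *
- * (The retry window referred to here is re-armed as
- * p->numa_migrate_retry = jiffies + HZ in numa_migrate_preferred(),
- * shown earlier in this diff.)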
- */ - if (time_after(jiffies, p->numa_migrate_retry)) - numa_migrate_preferred(p); - - if (migrated) - p->numa_pages_migrated += pages; - - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1681,7 +884,6 @@ void task_numa_work(struct callback_head *work) struct mm_struct *mm = p->mm; struct vm_area_struct *vma; unsigned long start, end; - unsigned long nr_pte_updates = 0; long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -1698,9 +900,35 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - if (!mm->numa_next_scan) { - mm->numa_next_scan = now + - msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); } /* @@ -1710,20 +938,20 @@ void task_numa_work(struct callback_head *work) if (time_before(now, migrate)) return; - if (p->numa_scan_period == 0) { - p->numa_scan_period_max = task_scan_max(p); - p->numa_scan_period = task_scan_min(p); - } + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; /* - * Delay this task enough that another task of this mm will likely win - * the next time around. + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work */ - p->node_stamp += 2 * TICK_NSEC; + if (migrate_ratelimited(numa_node_id())) + return; start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; @@ -1739,32 +967,18 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma)) continue; - /* - * Shared library pages mapped by multiple processes are not - * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso - * as migrating the pages will be of marginal benefit. - */ - if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + /* Skip small VMAs. 
They are not likely to be of relevance */ + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) continue; do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); - - /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. - */ - if (nr_pte_updates) - pages -= (end - start) >> PAGE_SHIFT; + pages -= change_prot_numa(vma, start, end); start = end; if (pages <= 0) @@ -1774,10 +988,10 @@ void task_numa_work(struct callback_head *work) out: /* - * It is possible to reach the end of the VMA list but the last few - * VMAs are not guaranteed to the vma_migratable. If they are not, we - * would find the !migratable VMA on the next scan but not reset the - * scanner to the start so check it now. + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. */ if (vma) mm->numa_scan_offset = start; @@ -1811,8 +1025,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) - curr->numa_scan_period = task_scan_min(curr); - curr->node_stamp += period; + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; if (!time_before(jiffies, curr->mm->numa_next_scan)) { init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1824,14 +1038,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } - -static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) -{ -} - -static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) -{ -} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -1841,12 +1047,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP - if (entity_is_task(se)) { - struct rq *rq = rq_of(cfs_rq); - - account_numa_enqueue(rq, task_of(se)); - list_add(&se->group_node, &rq->cfs_tasks); - } + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -1857,10 +1059,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -2178,7 +1378,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; @@ -2870,14 +2070,13 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void cfs_bandwidth_usage_inc(void) +void account_cfs_bandwidth_used(int enabled, int was_enabled) { - static_key_slow_inc(&__cfs_bandwidth_used); -} - -void cfs_bandwidth_usage_dec(void) -{ - 
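The VMA walk above advances in windows of sysctl_numa_balancing_scan_size pages, aligning each window end to a huge-page boundary and clamping it to the VMA. The arithmetic as a standalone sketch, with illustrative x86-64 constants:

    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define HPAGE_SIZE (1ULL << 21)                  /* 2 MiB huge page */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    static uint64_t scan_window_end(uint64_t start, long pages,
                                    uint64_t vm_end)
    {
        uint64_t end = ALIGN_UP(start + ((uint64_t)pages << PAGE_SHIFT),
                                HPAGE_SIZE);
        return end < vm_end ? end : vm_end;     /* never scan past the VMA */
    }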
static_key_slow_dec(&__cfs_bandwidth_used); + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + static_key_slow_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2885,8 +2084,7 @@ static bool cfs_bandwidth_used(void) return true; } -void cfs_bandwidth_usage_inc(void) {} -void cfs_bandwidth_usage_dec(void) {} +void account_cfs_bandwidth_used(int enabled, int was_enabled) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -3137,8 +2335,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); } @@ -3252,13 +2448,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (idle) goto out_unlock; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3319,13 +2508,7 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* - * Are we near the end of the current quota period? - * - * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of - * migrate_hrtimers, base is never cleared, so we are fine. - */ +/* are we near the end of the current quota period? */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -3401,12 +2584,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) return; - } + raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -3527,11 +2708,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). 
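account_cfs_bandwidth_used(), restored above, touches the expensive static key only on enabled<->disabled transitions rather than on every quota update. The counting logic in isolation, with a plain integer standing in for the jump label:

    static int bandwidth_users;   /* stand-in for the static key's refcount */

    static void account_cfs_bandwidth_used(int enabled, int was_enabled)
    {
        if (enabled && !was_enabled)
            bandwidth_users++;    /* static_key_slow_inc() in the kernel */
        else if (!enabled && was_enabled)
            bandwidth_users--;    /* static_key_slow_dec() */
    }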
In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { raw_spin_unlock(&cfs_b->lock); - cpu_relax(); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ if (cfs_b->timer_active) @@ -3932,7 +3113,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - if (!tg->parent || !wl) /* the trivial, non-cgroup case */ + if (!tg->parent) /* the trivial, non-cgroup case */ return wl; for_each_sched_entity(se) { @@ -3985,7 +3166,8 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { return wl; } @@ -4238,10 +3420,11 @@ done: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; @@ -4721,12 +3904,9 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; -enum fbq_type { regular, remote, all }; - #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_DST_PINNED 0x04 -#define LBF_SOME_PINNED 0x08 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; @@ -4749,8 +3929,6 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; - - enum fbq_type fbq_type; }; /* @@ -4797,78 +3975,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#ifdef CONFIG_NUMA_BALANCING -/* Returns true if the destination node has incurred more faults */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) -{ - int src_nid, dst_nid; - - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) - return true; - - return false; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Migrating away from the preferred node is always bad. 
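The removed migrate_improves_locality()/migrate_degrades_locality() pair compared per-node fault weights to bias load balancing. A simplified sketch of the preference test, collapsing the task and group weights into a single hypothetical per-node fault count:

    #include <stdbool.h>

    static bool migration_improves_locality(int src_nid, int dst_nid,
                                            int preferred_nid,
                                            const unsigned long *node_faults)
    {
        if (src_nid == dst_nid)
            return false;
        if (dst_nid == preferred_nid)   /* always favour the preferred node */
            return true;
        return node_faults[dst_nid] > node_faults[src_nid];
    }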
*/ - if (src_nid == p->numa_preferred_nid) - return true; - - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; -} - -#else -static inline bool migrate_improves_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; -} -#endif - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -4891,8 +3997,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); - env->flags |= LBF_SOME_PINNED; - /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4901,13 +4005,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_DST_PINNED; + env->flags |= LBF_SOME_PINNED; env->new_dst_cpu = cpu; break; } @@ -4926,24 +4030,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) task is cache cold, or + * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); - - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } -#endif - return 1; - } + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4986,6 +4077,8 @@ static int move_one_task(struct lb_env *env) return 0; } +static unsigned long task_h_load(struct task_struct *p); + static const unsigned int sched_nr_migrate_break = 32; /* @@ -5198,10 +4291,6 @@ struct sg_lb_stats { unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ -#ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; -#endif }; /* @@ -5241,7 +4330,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. * * Return: The load index. 
*/ @@ -5358,7 +4447,7 @@ void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long power; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5370,7 +4459,7 @@ void update_group_power(struct sched_domain *sd, int cpu) return; } - power_orig = power = 0; + power = 0; if (child->flags & SD_OVERLAP) { /* @@ -5378,33 +4467,8 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; - struct rq *rq = cpu_rq(cpu); - - /* - * build_sched_domains() -> init_sched_groups_power() - * gets here before we've attached the domains to the - * runqueues. - * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). - * - * This avoids power/power_orig from being 0 and - * causing divide-by-zero issues on boot. - * - * Runtime updates will correct power_orig. - */ - if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); - continue; - } - - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; - } + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); } else { /* * !SD_OVERLAP domains can assume that child groups @@ -5413,14 +4477,12 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; power += group->sgp->power; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgp->power_orig = sdg->sgp->power = power; } /* @@ -5464,12 +4526,13 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing the lower domain failed to reach balance and had difficulty - * moving tasks due to affinity constraints. + * by noticing it has a cpu that is overloaded while the remaining cpus are + * idle -- or rather, there's a distinct imbalance in the cpus; see + * sg_imbalanced(). * * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * update_sd_pick_busiest(). And calculcate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditional to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -5477,36 +4540,49 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -static inline int sg_imbalanced(struct sched_group *group) +struct sg_imb_stats { + unsigned long max_nr_running, min_nr_running; + unsigned long max_cpu_load, min_cpu_load; +}; + +static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) { - return group->sgp->imbalance; + sgi->max_cpu_load = sgi->max_nr_running = 0UL; + sgi->min_cpu_load = sgi->min_nr_running = ~0UL; } -/* - * Compute the group capacity. - * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. 
- */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline void +update_sg_imb_stats(struct sg_imb_stats *sgi, + unsigned long load, unsigned long nr_running) { - unsigned int capacity, smt, cpus; - unsigned int power, power_orig; - - power = group->sgp->power; - power_orig = group->sgp->power_orig; - cpus = group->group_weight; + if (load > sgi->max_cpu_load) + sgi->max_cpu_load = load; + if (sgi->min_cpu_load > load) + sgi->min_cpu_load = load; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + if (nr_running > sgi->max_nr_running) + sgi->max_nr_running = nr_running; + if (sgi->min_nr_running > nr_running) + sgi->min_nr_running = nr_running; +} - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); +static inline int +sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +{ + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && + (sgi->max_nr_running - sgi->min_nr_running) > 1) + return 1; - return capacity; + return 0; } /** @@ -5521,11 +4597,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { + struct sg_imb_stats sgi; unsigned long nr_running; unsigned long load; int i; - memset(sgs, 0, sizeof(*sgs)); + init_sg_imb_stats(&sgi); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -5533,22 +4610,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { load = target_load(i, load_idx); - else + } else { load = source_load(i, load_idx); + update_sg_imb_stats(&sgi, load, nr_running); + } sgs->group_load += load; sgs->sum_nr_running += nr_running; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; } + if (local_group && (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, group->sgp->next_update))) + update_group_power(env->sd, env->dst_cpu); + /* Adjust by relative CPU power of the group */ sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -5556,10 +4635,15 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_weight = group->group_weight; + sgs->group_imb = sg_imbalanced(sgs, &sgi); + + sgs->group_capacity = + DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); + + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; @@ -5609,42 +4693,14 @@ static bool update_sd_pick_busiest(struct 
lb_env *env, return false; } -#ifdef CONFIG_NUMA_BALANCING -static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) -{ - if (sgs->sum_nr_running > sgs->nr_numa_running) - return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) - return remote; - return all; -} - -static inline enum fbq_type fbq_classify_rq(struct rq *rq) -{ - if (rq->nr_running > rq->nr_numa_running) - return regular; - if (rq->nr_running > rq->nr_preferred_running) - return remote; - return all; -} -#else -static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) -{ - return all; -} - -static inline enum fbq_type fbq_classify_rq(struct rq *rq) -{ - return regular; -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. + * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -5664,17 +4720,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) { sds->local = sg; sgs = &sds->local_stat; - - if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); } + memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); - if (local_group) - goto next_group; - /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try @@ -5685,25 +4735,21 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) + if (prefer_sibling && !local_group && + sds->local && sds->local_stat.group_has_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); - if (update_sd_pick_busiest(env, sds, sg, sgs)) { - sds->busiest = sg; - sds->busiest_stat = *sgs; - } - -next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_pwr += sgs->group_power; + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + sds->busiest = sg; + sds->busiest_stat = *sgs; + } + sg = sg->next; } while (sg != env->sd->groups); - - if (env->sd->flags & SD_NUMA) - env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -6007,39 +5053,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; - enum fbq_type rt; - - rq = cpu_rq(i); - rt = fbq_classify_rq(rq); + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); + unsigned long wl; - /* - * We classify groups/runqueues into three groups: - * - regular: there are !numa tasks - * - remote: there are numa tasks that run on the 'wrong' node - * - all: there is no distinction - * - * In order to avoid migrating ideally placed numa tasks, - * ignore those when there's better options. 
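The restored sg_imbalanced() calls a group unbalanced when the load spread between its busiest and idlest CPU exceeds one average task and the run-queue lengths differ by more than one. The test in isolation:

    struct sg_imb {
        unsigned long max_cpu_load, min_cpu_load;
        unsigned long max_nr_running, min_nr_running;
    };

    static int sg_imbalanced(unsigned long load_per_task,
                             const struct sg_imb *s)
    {
        return (s->max_cpu_load - s->min_cpu_load) >= load_per_task &&
               (s->max_nr_running - s->min_nr_running) > 1;
    }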
- * - * If we ignore the actual busiest queue to migrate another - * task, the next balance pass can still reduce the busiest - * queue by moving tasks around inside the node. - * - * If we cannot move enough load due to this classification - * the next pass will adjust the group classification and - * allow migration of more tasks. - * - * Both cases only affect the total convergence complexity. - */ - if (rt > env->fbq_type) - continue; - - power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (!capacity) capacity = fix_small_capacity(env->sd, group); + rq = cpu_rq(i); wl = weighted_cpuload(i); /* @@ -6142,7 +5164,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -6156,7 +5177,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, - .fbq_type = all, }; /* @@ -6248,17 +5268,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_DST_PINNED; + env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -6266,18 +5286,6 @@ more_balance: goto more_balance; } - /* - * We failed to reach balance because of affinity. 
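The deleted fbq_type machinery ranked queues by how movable their tasks are, so ideally placed NUMA tasks were migrated only as a last resort. Its classification step, extracted:

    enum fbq_type { regular, remote, all };

    static enum fbq_type fbq_classify(unsigned int nr_running,
                                      unsigned int nr_numa_running,
                                      unsigned int nr_preferred_running)
    {
        if (nr_running > nr_numa_running)       /* has non-NUMA tasks */
            return regular;
        if (nr_running > nr_preferred_running)  /* has misplaced NUMA tasks */
            return remote;
        return all;                             /* only well-placed tasks */
    }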
- */ - if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; - - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { - *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; - } - /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -6385,7 +5393,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - u64 curr_cost = 0; this_rq->idle_stamp = rq_clock(this_rq); @@ -6402,27 +5409,15 @@ void idle_balance(int this_cpu, struct rq *this_rq) for_each_domain(this_cpu, sd) { unsigned long interval; int continue_balancing = 1; - u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) - break; - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; } interval = msecs_to_jiffies(sd->balance_interval); @@ -6444,9 +5439,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; } /* @@ -6580,16 +5572,16 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + for (; sd; sd = sd->parent) + atomic_inc(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6597,16 +5589,16 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + for (; sd; sd = sd->parent) + atomic_dec(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6670,39 +5662,15 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; - u64 max_cost = 0; + int need_serialize; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { - /* - * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. - */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } - max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) continue; - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. 
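The deleted idle_balance() accounting brackets each newidle balance with sched_clock_cpu() reads and keeps the worst observed cost per domain, so later idles can skip domains that cost more than the expected idle time. A userspace sketch of the measurement, assuming POSIX clock_gettime():

    #include <time.h>

    static long long now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static long long max_newidle_lb_cost;

    static void timed_balance(void (*load_balance_fn)(void))
    {
        long long t0 = now_ns();
        load_balance_fn();
        long long cost = now_ns() - t0;
        if (cost > max_newidle_lb_cost)     /* remember the worst case */
            max_newidle_lb_cost = cost;
    }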
- */ - if (!continue_balancing) { - if (need_decay) - continue; - break; - } - interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; @@ -6721,7 +5689,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_DST_PINNED logic could have changed + * The LBF_SOME_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ @@ -6736,14 +5704,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - } - if (need_decay) { + /* - * Ensure the rq-wide value also decays but keep it at a - * reasonable floor to avoid funnies with rq->avg_idle. + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. */ - rq->max_idle_balance_cost = - max((u64)sysctl_sched_migration_cost, max_cost); + if (!continue_balancing) + break; } rcu_read_unlock(); @@ -6813,8 +5781,6 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; - int nr_busy; if (unlikely(idle_cpu(cpu))) return 0; @@ -6840,22 +5806,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - - if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + struct sched_group_power *sgp = sg->sgp; + int nr_busy = atomic_read(&sgp->nr_busy_cpus); - if (nr_busy > 1) + if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) goto need_kick_unlock; - } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); - - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight + && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) + break; + } rcu_read_unlock(); return 0; @@ -7248,8 +6214,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - /* guarantee group entities always have weight */ - update_load_set(&se->load, NICE_0_LOAD); + update_load_set(&se->load, 0); se->parent = parent; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929..99399f8 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,23 +63,10 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) - -/* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. - */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. 
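The deleted decay in rebalance_domains() multiplies the recorded cost by 253/256 roughly once a second; 253/256 ≈ 0.988, so a stale maximum halves in about a minute (0.988^60 ≈ 0.49). As code:

    /* ~1% exponential decay per call, applied about once per second. */
    static unsigned long long decay_lb_cost(unsigned long long cost)
    {
        return (cost * 253) >> 8;   /* i.e. cost * 253 / 256 */
    }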
- */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) +SCHED_FEAT(NUMA_FORCE, false) #endif diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9..d8da010 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7d57275..01970c8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -246,10 +246,8 @@ static inline void rt_set_overload(struct rq *rq) * if we should look at the mask. It would be a shame * if we looked at the mask, but the mask was not * updated yet. - * - * Matched by the barrier in pull_rt_task(). */ - smp_wmb(); + wmb(); atomic_inc(&rq->rd->rto_count); } @@ -1171,10 +1169,13 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + int cpu; + + cpu = task_cpu(p); if (p->nr_cpus_allowed == 1) goto out; @@ -1212,7 +1213,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + curr->prio <= p->prio) && + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1628,12 +1630,6 @@ static int pull_rt_task(struct rq *this_rq) if (likely(!rt_overloaded(this_rq))) return 0; - /* - * Match the barrier from rt_set_overloaded; this guarantees that if we - * see overloaded we must also see the rto_mask bit. - */ - smp_rmb(); - for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1935,8 +1931,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = sched_rr_timeslice; /* - * Requeue to the end of queue if we (and all of our ancestors) are not - * the only element on the queue + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b2..b3c5653 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,7 +6,6 @@ #include <linux/spinlock.h> #include <linux/stop_machine.h> #include <linux/tick.h> -#include <linux/slab.h> #include "cpupri.h" #include "cpuacct.h" @@ -409,10 +408,6 @@ struct rq { * remote CPUs use both these fields when doing load calculation. 
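The rt.c hunks drop the documented smp_wmb()/smp_rmb() pairing between rt_set_overload() and pull_rt_task(): the mask update must be visible before the count, and the count must be read before the mask. A C11 analogue of that pairing, using release/acquire in place of the explicit barriers:

    #include <stdatomic.h>

    static _Atomic unsigned long rto_mask;
    static _Atomic int rto_count;

    static void rt_set_overload(int cpu)
    {
        atomic_fetch_or_explicit(&rto_mask, 1UL << cpu,
                                 memory_order_relaxed);
        /* release orders the mask write before the count increment */
        atomic_fetch_add_explicit(&rto_count, 1, memory_order_release);
    }

    static int rt_overloaded(unsigned long *mask)
    {
        /* acquire pairs with the release above: nonzero count implies
         * the mask bit is already visible */
        if (atomic_load_explicit(&rto_count, memory_order_acquire) == 0)
            return 0;
        *mask = atomic_load_explicit(&rto_mask, memory_order_relaxed);
        return 1;
    }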
*/ unsigned int nr_running; -#ifdef CONFIG_NUMA_BALANCING - unsigned int nr_numa_running; - unsigned int nr_preferred_running; -#endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; @@ -481,9 +476,6 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; - - /* This is used to determine avg_idle's max value */ - u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -560,12 +552,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -#ifdef CONFIG_NUMA_BALANCING -extern void sched_setnuma(struct task_struct *p, int node); -extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *, struct task_struct *); -#endif /* CONFIG_NUMA_BALANCING */ - #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ @@ -607,24 +593,9 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) return hsd; } -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) { - if (sd->flags & flag) - break; - } - - return sd; -} - DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); -DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_power { atomic_t ref; @@ -634,7 +605,6 @@ struct sched_group_power { */ unsigned int power, power_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ /* * Number of busy cpus in this group. */ @@ -749,7 +719,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); task_thread_info(p)->cpu = cpu; - p->wake_cpu = cpu; #endif } @@ -1005,7 +974,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); @@ -1251,24 +1220,6 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } -static inline void double_lock(spinlock_t *l1, spinlock_t *l2) -{ - if (l1 > l2) - swap(l1, l2); - - spin_lock(l1); - spin_lock_nested(l2, SINGLE_DEPTH_NESTING); -} - -static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) -{ - if (l1 > l2) - swap(l1, l2); - - raw_spin_lock(l1); - raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); -} - /* * double_rq_lock - safely lock two runqueues * @@ -1354,8 +1305,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void cfs_bandwidth_usage_inc(void); -extern void cfs_bandwidth_usage_dec(void); +extern void account_cfs_bandwidth_used(int enabled, int was_enabled); #ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4ab7043..c7edee7 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) * from dequeue_task() to account for possible rq->clock skew across cpus. 
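The removed double_lock() helpers encode the standard deadlock-avoidance rule: when two locks of equal rank must both be held, acquire them in a fixed global order, here by address. A pthread sketch of the same idiom:

    #include <pthread.h>

    static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
    {
        if (l1 > l2) {              /* impose a global order by address */
            pthread_mutex_t *tmp = l1;
            l1 = l2;
            l2 = tmp;
        }
        pthread_mutex_lock(l1);
        pthread_mutex_lock(l2);     /* nested acquisition is now safe */
    }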
The * delta taken on each cpu would annul the skew. */ -static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; - rq_sched_info_dequeued(rq, delta); + rq_sched_info_dequeued(task_rq(t), delta); } /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(struct rq *rq, struct task_struct *t) +static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; - rq_sched_info_arrive(rq, delta); + rq_sched_info_arrive(task_rq(t), delta); } /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); + t->sched_info.last_queued = rq_clock(task_rq(t)); } /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ -static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = rq_clock(rq) - + unsigned long long delta = rq_clock(task_rq(t)) - t->sched_info.last_arrival; - rq_sched_info_depart(rq, delta); + rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) - sched_info_queued(rq, t); + sched_info_queued(t); } /* @@ -128,34 +128,32 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { + struct rq *rq = task_rq(prev); + /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. 
*/ if (prev != rq->idle) - sched_info_depart(rq, prev); + sched_info_depart(prev); if (next != rq->idle) - sched_info_arrive(rq, next); + sched_info_arrive(next); } static inline void -sched_info_switch(struct rq *rq, - struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) - __sched_info_switch(rq, prev, next); + __sched_info_switch(prev, next); } #else -#define sched_info_queued(rq, t) do { } while (0) +#define sched_info_queued(t) do { } while (0) #define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(rq, t) do { } while (0) -#define sched_info_depart(rq, t) do { } while (0) -#define sched_info_arrive(rq, next) do { } while (0) -#define sched_info_switch(rq, t, next) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de..e08fbee 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } diff --git a/kernel/locking/semaphore.c b/kernel/semaphore.c index 6815171..6815171 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/semaphore.c diff --git a/kernel/signal.c b/kernel/signal.c index 940b30e..ded28b9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER -int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) +int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) { int err; diff --git a/kernel/smp.c b/kernel/smp.c index bd9f940..0564571 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,9 +15,9 @@ #include "smpboot.h" +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, - CSD_FLAG_WAIT = 0x02, }; struct call_function_data { @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) static void csd_unlock(struct call_single_data *csd) { - WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: @@ -139,15 +139,13 @@ static void csd_unlock(struct call_single_data *csd) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static +void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; int ipi; - if (wait) - csd->flags |= CSD_FLAG_WAIT; - raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&csd->list, &dst->list); @@ -342,7 +340,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, } put_cpu(); } -EXPORT_SYMBOL_GPL(__smp_call_function_single); /** * smp_call_function_many(): Run a function on a set of other CPUs. 
@@ -462,6 +459,7 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); +#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; @@ -526,11 +524,6 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -void __weak smp_announce(void) -{ - printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); -} - /* Called by boot processor to activate the rest. */ void __init smp_init(void) { @@ -547,7 +540,7 @@ void __init smp_init(void) } /* Any cleanup work */ - smp_announce(); + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); smp_cpus_done(setup_max_cpus); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025cc..d7d498d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/export.h> @@ -27,6 +29,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/irq.h> +#include <asm/irq.h> /* - No shared variables, all the data are CPU local. - If a softirq needs serialization, let it serialize itself @@ -97,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) raw_local_irq_save(flags); /* - * The preempt tracer hooks into preempt_count_add and will break + * The preempt tracer hooks into add_preempt_count and will break * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET * is set and before current->softirq_enabled is cleared. * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - __preempt_count_add(cnt); + preempt_count() += cnt; /* * Were softirqs turned off above: */ @@ -117,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - preempt_count_add(cnt); + add_preempt_count(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -131,11 +134,12 @@ EXPORT_SYMBOL(local_bh_disable); static void __local_bh_enable(unsigned int cnt) { + WARN_ON_ONCE(in_irq()); WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) trace_softirqs_on(_RET_IP_); - preempt_count_sub(cnt); + sub_preempt_count(cnt); } /* @@ -145,7 +149,6 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } @@ -166,17 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); - if (unlikely(!in_interrupt() && local_softirq_pending())) { - /* - * Run softirq if any pending. And do it in its own stack - * as we may be calling this deep in a task call stack already. 
- */ + if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); - } - preempt_count_dec(); + dec_preempt_count(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_enable(); #endif @@ -258,7 +256,7 @@ restart: " exited with %08x?\n", vec_nr, softirq_to_name[vec_nr], h->action, prev_count, preempt_count()); - preempt_count_set(prev_count); + preempt_count() = prev_count; } rcu_bh_qs(cpu); @@ -282,11 +280,10 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); - WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } - +#ifndef __ARCH_HAS_DO_SOFTIRQ asmlinkage void do_softirq(void) { @@ -301,11 +298,13 @@ asmlinkage void do_softirq(void) pending = local_softirq_pending(); if (pending) - do_softirq_own_stack(); + __do_softirq(); local_irq_restore(flags); } +#endif + /* * Enter an interrupt context. */ @@ -330,21 +329,15 @@ void irq_enter(void) static inline void invoke_softirq(void) { if (!force_irqthreads) { -#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if * it is the irq stack, because it should be near empty - * at this stage. + * at this stage. But we have no way to know if the arch + * calls irq_exit() on the irq stack. So call softirq + * in its own stack to prevent from any overrun on top + * of a potentially deep task stack. */ - __do_softirq(); -#else - /* - * Otherwise, irq_exit() is called on the task stack that can - * be potentially deep already. So call softirq in its own stack - * to prevent from any overrun. - */ - do_softirq_own_stack(); -#endif + do_softirq(); } else { wakeup_softirqd(); } @@ -376,7 +369,7 @@ void irq_exit(void) account_irq_exit_time(current); trace_hardirq_exit(); - preempt_count_sub(HARDIRQ_OFFSET); + sub_preempt_count(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); @@ -625,17 +618,146 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); +/* + * Remote softirq bits + */ + +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = *(int *)cp->info; + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = &softirq; + cp->flags = 0; + + __smp_call_function_single(cpu, cp, 0); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. 
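__local_trigger(), restored above, raises the softirq only on the empty-to-non-empty transition of the per-cpu work list, so one notification drains any number of queued items. The edge detection in isolation:

    #include <stdbool.h>
    #include <stddef.h>

    struct work { struct work *next; };
    struct work_list { struct work *head, **tail; };

    static void work_list_init(struct work_list *q)
    {
        q->head = NULL;
        q->tail = &q->head;
    }

    /* Returns true when the caller must raise the softirq. */
    static bool work_enqueue(struct work_list *q, struct work *w)
    {
        bool was_empty = (q->head == NULL);
        w->next = NULL;
        *q->tail = w;
        q->tail = &w->next;
        return was_empty;
    }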
+ * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } @@ -649,10 +771,6 @@ static void run_ksoftirqd(unsigned int cpu) { local_irq_disable(); if (local_softirq_pending()) { - /* - * We can safely run softirq on inline stack, as we are not deep - * in the task stack here. - */ __do_softirq(); rcu_note_context_switch(cpu); local_irq_enable(); diff --git a/kernel/locking/spinlock.c b/kernel/spinlock.c index 4b082b5..4b082b5 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/spinlock.c diff --git a/kernel/rcu/srcu.c b/kernel/srcu.c index 01d5ccb..01d5ccb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/srcu.c diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e0..c09f295 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> -#include <linux/lglock.h> /* * Structure to determine completion condition and record errors. May @@ -44,14 +43,6 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. 
- */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); - static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); @@ -124,184 +115,6 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) return done.executed ? done.ret : -ENOENT; } -/* This controls the threads on each CPU. */ -enum multi_stop_state { - /* Dummy starting state for thread. */ - MULTI_STOP_NONE, - /* Awaiting everyone to be scheduled. */ - MULTI_STOP_PREPARE, - /* Disable interrupts. */ - MULTI_STOP_DISABLE_IRQ, - /* Run the function */ - MULTI_STOP_RUN, - /* Exit */ - MULTI_STOP_EXIT, -}; - -struct multi_stop_data { - int (*fn)(void *); - void *data; - /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ - unsigned int num_threads; - const struct cpumask *active_cpus; - - enum multi_stop_state state; - atomic_t thread_ack; -}; - -static void set_state(struct multi_stop_data *msdata, - enum multi_stop_state newstate) -{ - /* Reset ack counter. */ - atomic_set(&msdata->thread_ack, msdata->num_threads); - smp_wmb(); - msdata->state = newstate; -} - -/* Last one to ack a state moves to the next state. */ -static void ack_state(struct multi_stop_data *msdata) -{ - if (atomic_dec_and_test(&msdata->thread_ack)) - set_state(msdata, msdata->state + 1); -} - -/* This is the cpu_stop function which stops the CPU. */ -static int multi_cpu_stop(void *data) -{ - struct multi_stop_data *msdata = data; - enum multi_stop_state curstate = MULTI_STOP_NONE; - int cpu = smp_processor_id(), err = 0; - unsigned long flags; - bool is_active; - - /* - * When called from stop_machine_from_inactive_cpu(), irq might - * already be disabled. Save the state and restore it on exit. - */ - local_save_flags(flags); - - if (!msdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, msdata->active_cpus); - - /* Simple state machine */ - do { - /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); - if (msdata->state != curstate) { - curstate = msdata->state; - switch (curstate) { - case MULTI_STOP_DISABLE_IRQ: - local_irq_disable(); - hard_irq_disable(); - break; - case MULTI_STOP_RUN: - if (is_active) - err = msdata->fn(msdata->data); - break; - default: - break; - } - ack_state(msdata); - } - } while (curstate != MULTI_STOP_EXIT); - - local_irq_restore(flags); - return err; -} - -struct irq_cpu_stop_queue_work_info { - int cpu1; - int cpu2; - struct cpu_stop_work *work1; - struct cpu_stop_work *work2; -}; - -/* - * This function is always run with irqs and preemption disabled. - * This guarantees that both work1 and work2 get queued, before - * our local migrate thread gets the chance to preempt us. - */ -static void irq_cpu_stop_queue_work(void *arg) -{ - struct irq_cpu_stop_queue_work_info *info = arg; - cpu_stop_queue_work(info->cpu1, info->work1); - cpu_stop_queue_work(info->cpu2, info->work2); -} - -/** - * stop_two_cpus - stops two cpus - * @cpu1: the cpu to stop - * @cpu2: the other cpu to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Stops both the current and specified CPU and runs @fn on one of them. - * - * returns when both are completed. 
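Both the removed multi_cpu_stop() and the restored stop_machine_cpu_stop() drive the same lock-step state machine: set_state() re-arms an acknowledgement counter before publishing the new state, and the last CPU to ack advances everyone. A C11 sketch of that handshake:

    #include <stdatomic.h>

    enum mstate { M_NONE, M_PREPARE, M_DISABLE_IRQ, M_RUN, M_EXIT };

    static _Atomic int state = M_NONE;
    static _Atomic int thread_ack;
    static int num_threads;           /* set once before the run */

    static void set_state(int newstate)
    {
        atomic_store_explicit(&thread_ack, num_threads,
                              memory_order_relaxed);
        /* release stands in for the kernel's smp_wmb():
         * the ack count must be visible before the new state */
        atomic_store_explicit(&state, newstate, memory_order_release);
    }

    /* Last CPU to acknowledge moves the whole machine forward. */
    static void ack_state(void)
    {
        if (atomic_fetch_sub_explicit(&thread_ack, 1,
                                      memory_order_acq_rel) == 1)
            set_state(atomic_load_explicit(&state,
                                           memory_order_relaxed) + 1);
    }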
-
-/**
- * stop_two_cpus - stops two cpus
- * @cpu1: the cpu to stop
- * @cpu2: the other cpu to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Stops both the current and specified CPU and runs @fn on one of them.
- *
- * returns when both are completed.
- */
-int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
-{
-	struct cpu_stop_done done;
-	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
-	struct multi_stop_data msdata;
-
-	preempt_disable();
-	msdata = (struct multi_stop_data){
-		.fn = fn,
-		.data = arg,
-		.num_threads = 2,
-		.active_cpus = cpumask_of(cpu1),
-	};
-
-	work1 = work2 = (struct cpu_stop_work){
-		.fn = multi_cpu_stop,
-		.arg = &msdata,
-		.done = &done
-	};
-
-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
-	cpu_stop_init_done(&done, 2);
-	set_state(&msdata, MULTI_STOP_PREPARE);
-
-	/*
-	 * If we observe both CPUs active we know _cpu_down() cannot yet have
-	 * queued its stop_machine works and therefore ours will get executed
-	 * first. Or its not either one of our CPUs that's getting unplugged,
-	 * in which case we don't care.
-	 *
-	 * This relies on the stopper workqueues to be FIFO.
-	 */
-	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
-		preempt_enable();
-		return -ENOENT;
-	}
-
-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 0);
-	lg_local_unlock(&stop_cpus_lock);
-	preempt_enable();
-
-	wait_for_completion(&done.completion);
-
-	return done.executed ? done.ret : -ENOENT;
-}
-
 /**
  * stop_one_cpu_nowait - stop a cpu but don't wait for completion
  * @cpu: cpu to stop
@@ -346,10 +159,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
 	 * preempted by a stopper which might wait for other stoppers
 	 * to enter @fn which can lead to deadlock.
 	 */
-	lg_global_lock(&stop_cpus_lock);
+	preempt_disable();
 	for_each_cpu(cpu, cpumask)
 		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
-	lg_global_unlock(&stop_cpus_lock);
+	preempt_enable();
 }
 
 static int __stop_cpus(const struct cpumask *cpumask,
@@ -546,14 +359,98 @@ early_initcall(cpu_stop_init);
 
 #ifdef CONFIG_STOP_MACHINE
 
+/* This controls the threads on each CPU. */
+enum stopmachine_state {
+	/* Dummy starting state for thread. */
+	STOPMACHINE_NONE,
+	/* Awaiting everyone to be scheduled. */
+	STOPMACHINE_PREPARE,
+	/* Disable interrupts. */
+	STOPMACHINE_DISABLE_IRQ,
+	/* Run the function */
+	STOPMACHINE_RUN,
+	/* Exit */
+	STOPMACHINE_EXIT,
+};
+
+struct stop_machine_data {
+	int			(*fn)(void *);
+	void			*data;
+	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+	unsigned int		num_threads;
+	const struct cpumask	*active_cpus;
+
+	enum stopmachine_state	state;
+	atomic_t		thread_ack;
+};
+
+static void set_state(struct stop_machine_data *smdata,
+		      enum stopmachine_state newstate)
+{
+	/* Reset ack counter. */
+	atomic_set(&smdata->thread_ack, smdata->num_threads);
+	smp_wmb();
+	smdata->state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(struct stop_machine_data *smdata)
+{
+	if (atomic_dec_and_test(&smdata->thread_ack))
+		set_state(smdata, smdata->state + 1);
+}
+
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
+{
+	struct stop_machine_data *smdata = data;
+	enum stopmachine_state curstate = STOPMACHINE_NONE;
+	int cpu = smp_processor_id(), err = 0;
+	unsigned long flags;
+	bool is_active;
+
+	/*
+	 * When called from stop_machine_from_inactive_cpu(), irq might
+	 * already be disabled. Save the state and restore it on exit.
+	 */
+	local_save_flags(flags);
+
+	if (!smdata->active_cpus)
+		is_active = cpu == cpumask_first(cpu_online_mask);
+	else
+		is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
+
+	/* Simple state machine */
+	do {
+		/* Chill out and ensure we re-read stopmachine_state. */
+		cpu_relax();
+		if (smdata->state != curstate) {
+			curstate = smdata->state;
+			switch (curstate) {
+			case STOPMACHINE_DISABLE_IRQ:
+				local_irq_disable();
+				hard_irq_disable();
+				break;
+			case STOPMACHINE_RUN:
+				if (is_active)
+					err = smdata->fn(smdata->data);
+				break;
+			default:
+				break;
+			}
+			ack_state(smdata);
+		}
+	} while (curstate != STOPMACHINE_EXIT);
+
+	local_irq_restore(flags);
+	return err;
+}
+
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
-	struct multi_stop_data msdata = {
-		.fn = fn,
-		.data = data,
-		.num_threads = num_online_cpus(),
-		.active_cpus = cpus,
-	};
+	struct stop_machine_data smdata = { .fn = fn, .data = data,
+					    .num_threads = num_online_cpus(),
+					    .active_cpus = cpus };
 
 	if (!stop_machine_initialized) {
 		/*
@@ -564,7 +461,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 		unsigned long flags;
 		int ret;
 
-		WARN_ON_ONCE(msdata.num_threads != 1);
+		WARN_ON_ONCE(smdata.num_threads != 1);
 
 		local_irq_save(flags);
 		hard_irq_disable();
@@ -575,8 +472,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 	}
 
 	/* Set the initial state and stop all online cpus. */
-	set_state(&msdata, MULTI_STOP_PREPARE);
-	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
+	set_state(&smdata, STOPMACHINE_PREPARE);
+	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
 }
 
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -616,25 +513,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
 int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
 				   const struct cpumask *cpus)
 {
-	struct multi_stop_data msdata = { .fn = fn, .data = data,
+	struct stop_machine_data smdata = { .fn = fn, .data = data,
 					    .active_cpus = cpus };
 	struct cpu_stop_done done;
 	int ret;
 
 	/* Local CPU must be inactive and CPU hotplug in progress. */
 	BUG_ON(cpu_active(raw_smp_processor_id()));
-	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */
+	smdata.num_threads = num_active_cpus() + 1;	/* +1 for local */
 
 	/* No proper task established and can't sleep - busy wait for lock. */
 	while (!mutex_trylock(&stop_cpus_mutex))
 		cpu_relax();
 
 	/* Schedule work on other CPUs and execute directly for local CPU */
-	set_state(&msdata, MULTI_STOP_PREPARE);
+	set_state(&smdata, STOPMACHINE_PREPARE);
 	cpu_stop_init_done(&done, num_active_cpus());
-	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
+	queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
 			     &done);
-	ret = multi_cpu_stop(&msdata);
+	ret = stop_machine_cpu_stop(&smdata);
 
 	/* Busy wait for completion. */
 	while (!completion_done(&done.completion))
diff --git a/kernel/sys.c b/kernel/sys.c
index c723113..c18ecca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,7 @@
 #include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/capability.h>
 #include <linux/device.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 34a6047..b2f06f3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
-static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
 
 static int sysrq_sysctl_handler(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
@@ -371,6 +371,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "numa_balancing_scan_period_reset",
+		.data		= &sysctl_numa_balancing_scan_period_reset,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "numa_balancing_scan_period_max_ms",
 		.data		= &sysctl_numa_balancing_scan_period_max,
 		.maxlen		= sizeof(unsigned int),
@@ -384,20 +391,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "numa_balancing_settle_count",
-		.data		= &sysctl_numa_balancing_settle_count,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_migrate_deferred",
-		.data		= &sysctl_numa_balancing_migrate_deferred,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
@@ -969,10 +962,9 @@ static struct ctl_table kern_table[] = {
 	{
 		.procname	= "hung_task_check_count",
 		.data		= &sysctl_hung_task_check_count,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "hung_task_timeout_secs",
@@ -1057,7 +1049,6 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
-		.extra1		= &one,
 	},
 	{
 		.procname	= "perf_cpu_time_max_percent",
@@ -2223,11 +2214,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 			*i = val;
 		} else {
 			val = convdiv * (*i) / convmul;
-			if (!first) {
+			if (!first)
 				err = proc_put_char(&buffer, &left, '\t');
-				if (err)
-					break;
-			}
 			err = proc_put_long(&buffer, &left, val, false);
 			if (err)
 				break;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd..b609213 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
 			if (get_user(value, vec + i))
 				goto out_kfree;
 
-			str += scnprintf(str, end - str, "%lu\t", value);
+			str += snprintf(str, end - str, "%lu\t", value);
 		}
 
 		result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
 			if (get_user(value, vec + i))
 				goto out_kfree;
 
-			str += scnprintf(str, end - str, "%lu\t", value);
+			str += snprintf(str, end - str, "%lu\t", value);
 		}
 
 		result = kernel_write(file, buffer, str - buffer, 0);
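[Editor's aside — not part of the commit. The scnprintf() -> snprintf() hunks above revert a truncation fix, and the difference is easy to miss: snprintf() returns the length the output *would* have had, so "str += snprintf(str, end - str, ...)" can step str past end once the buffer fills, while scnprintf() returns the bytes actually written. Demonstrated with the user-space snprintf():

#include <stdio.h>

int main(void)
{
	char buf[8];
	int would_be = snprintf(buf, sizeof(buf), "%s", "0123456789");

	/* would_be is 10 even though only 7 chars plus a NUL were stored. */
	printf("returned %d, stored \"%s\"\n", would_be, buf);
	return 0;
}

Advancing a cursor by that would-be length is exactly the pointer-overrun pattern the later scnprintf() conversion was meant to close.]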
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
 		if (get_user(dnaddr, (__le16 __user *)newval))
 			goto out;
 
-		len = scnprintf(buf, sizeof(buf), "%hu.%hu",
+		len = snprintf(buf, sizeof(buf), "%hu.%hu",
 				le16_to_cpu(dnaddr) >> 10,
 				le16_to_cpu(dnaddr) & 0x3ff);
 
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 4aef390..0000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
-	__INITRODATA
-
-	.globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-	.incbin "kernel/x509_certificate_list"
-	.globl VMLINUX_SYMBOL(system_certificate_list_end)
-VMLINUX_SYMBOL(system_certificate_list_end):
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 564dd93..0000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const u8 system_certificate_list_end[];
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
-	pr_notice("Initialise system trusted keyring\n");
-
-	system_trusted_keyring =
-		keyring_alloc(".system_keyring",
-			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
-			      KEY_ALLOC_NOT_IN_QUOTA, NULL);
-	if (IS_ERR(system_trusted_keyring))
-		panic("Can't allocate system trusted keyring\n");
-
-	set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
-	return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
-	key_ref_t key;
-	const u8 *p, *end;
-	size_t plen;
-
-	pr_notice("Loading compiled-in X.509 certificates\n");
-
-	end = system_certificate_list_end;
-	p = system_certificate_list;
-	while (p < end) {
-		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more
-		 * than 256 bytes in size.
-		 */
-		if (end - p < 4)
-			goto dodgy_cert;
-		if (p[0] != 0x30 &&
-		    p[1] != 0x82)
-			goto dodgy_cert;
-		plen = (p[2] << 8) | p[3];
-		plen += 4;
-		if (plen > end - p)
-			goto dodgy_cert;
-
-		key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
-					   "asymmetric",
-					   NULL,
-					   p,
-					   plen,
-					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-					   KEY_USR_VIEW | KEY_USR_READ),
-					   KEY_ALLOC_NOT_IN_QUOTA |
-					   KEY_ALLOC_TRUSTED);
-		if (IS_ERR(key)) {
-			pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
-			       PTR_ERR(key));
-		} else {
-			pr_notice("Loaded X.509 cert '%s'\n",
-				  key_ref_to_ptr(key)->description);
-			key_ref_put(key);
-		}
-		p += plen;
-	}
-
-	return 0;
-
-dodgy_cert:
-	pr_err("Problem parsing in-kernel X.509 certificate list\n");
-	return 0;
-}
-late_initcall(load_system_certificate_list);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7c..145bb4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,7 +290,6 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
 	struct listener_list *listeners;
 	struct listener *s, *tmp, *s2;
 	unsigned int cpu;
-	int ret = 0;
 
 	if (!cpumask_subset(mask, cpu_possible_mask))
 		return -EINVAL;
@@ -305,10 +304,9 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
 		for_each_cpu(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener),
 					GFP_KERNEL, cpu_to_node(cpu));
-			if (!s) {
-				ret = -ENOMEM;
+			if (!s)
 				goto cleanup;
-			}
+
 			s->pid = pid;
 			s->valid = 1;
 
@@ -341,7 +339,7 @@ cleanup:
 		}
 		up_write(&listeners->sem);
 	}
-	return ret;
+	return 0;
 }
 
 static int parse(struct nlattr *na, struct cpumask *mask)
@@ -406,15 +404,11 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 	if (!na)
 		goto err;
 
-	if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
-		nla_nest_cancel(skb, na);
+	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
 		goto err;
-	}
 	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
-	if (!ret) {
-		nla_nest_cancel(skb, na);
+	if (!ret)
 		goto err;
-	}
 	nla_nest_end(skb, na);
 
 	return nla_data(ret);
@@ -673,18 +667,17 @@ err:
 	nlmsg_free(rep_skb);
 }
 
-static const struct genl_ops taskstats_ops[] = {
-	{
-		.cmd		= TASKSTATS_CMD_GET,
-		.doit		= taskstats_user_cmd,
-		.policy		= taskstats_cmd_get_policy,
-		.flags		= GENL_ADMIN_PERM,
-	},
-	{
-		.cmd		= CGROUPSTATS_CMD_GET,
-		.doit		= cgroupstats_user_cmd,
-		.policy		= cgroupstats_cmd_get_policy,
-	},
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_user_cmd,
+	.policy		= taskstats_cmd_get_policy,
+	.flags		= GENL_ADMIN_PERM,
+};
+
+static struct genl_ops cgroupstats_ops = {
+	.cmd		= CGROUPSTATS_CMD_GET,
+	.doit		= cgroupstats_user_cmd,
+	.policy		= cgroupstats_cmd_get_policy,
 };
 
 /* Needed early in initialization */
@@ -703,13 +696,26 @@ static int __init taskstats_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&family, taskstats_ops);
+	rc = genl_register_family(&family);
 	if (rc)
 		return rc;
 
+	rc = genl_register_ops(&family, &taskstats_ops);
+	if (rc < 0)
+		goto err;
+
+	rc = genl_register_ops(&family, &cgroupstats_ops);
+	if (rc < 0)
+		goto err_cgroup_ops;
+
 	family_registered = 1;
 	pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
 	return 0;
+
+err_cgroup_ops:
+	genl_unregister_ops(&family, &taskstats_ops);
+err:
+	genl_unregister_family(&family);
+	return rc;
 }
 
 /*
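[Editor's aside — not part of the commit. The deleted certificate walk above sizes each blob from its DER header: an X.509 certificate is an ASN.1 SEQUENCE (tag 0x30) which, for bodies over 255 bytes, uses the two-byte long-form length (0x82). A standalone sketch of that header parse (der_cert_len is an illustrative name, not a kernel function); note the deleted kernel code's "&&" only rejected a blob when *both* bytes mismatched, which this sketch tightens to "||":

#include <stddef.h>

/* Return total cert length (header + payload), or 0 if malformed. */
static size_t der_cert_len(const unsigned char *p, size_t avail)
{
	size_t plen;

	if (avail < 4)
		return 0;
	if (p[0] != 0x30 || p[1] != 0x82)	/* SEQUENCE, long-form length */
		return 0;
	plen = ((size_t)p[2] << 8) | p[3];	/* big-endian 16-bit length */
	plen += 4;				/* account for the 4-byte header */
	return plen <= avail ? plen : 0;
}

Walking the blob then reduces to p += der_cert_len(p, end - p) until the region is exhausted.]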
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c..2b62fe8 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
 	# RCU_USER_QS dependency
 	depends on HAVE_CONTEXT_TRACKING
 	# VIRT_CPU_ACCOUNTING_GEN dependency
-	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+	depends on 64BIT
 	select NO_HZ_COMMON
 	select RCU_USER_QS
 	select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65..eec50fc 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
 
 	if (!alarmtimer_get_rtcdev())
-		return -EINVAL;
+		return -ENOTSUPP;
 
 	return hrtimer_get_res(baseid, tp);
 }
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
 	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
 
 	if (!alarmtimer_get_rtcdev())
-		return -EINVAL;
+		return -ENOTSUPP;
 
 	*tp = ktime_to_timespec(base->gettime());
 	return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad60..662c579 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
 				     const char *buf, size_t count)
 {
 	char name[CS_NAME_LEN];
-	ssize_t ret = sysfs_get_uname(buf, name, count);
+	size_t ret = sysfs_get_uname(buf, name, count);
 	struct clock_event_device *ce;
 
 	if (ret < 0)
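[Editor's aside — not part of the commit. The ssize_t -> size_t hunks here (and in clocksource.c below) are worth pausing on: once ret is unsigned, "if (ret < 0)" can never be true, so an -EINVAL-style error survives as a huge positive value. A two-line demonstration:

#include <stdio.h>

int main(void)
{
	size_t ret = (size_t)-22;	/* what returning -EINVAL becomes */

	if (ret < 0)			/* always false for an unsigned type */
		printf("error caught\n");
	else
		printf("error missed, ret=%zu\n", ret);
	return 0;
}

Most compilers warn that the comparison is always false, which is why error-returning helpers conventionally use the signed ssize_t.]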
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502..50a8736 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,7 +479,6 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
 static inline int __clocksource_watchdog_kthread(void) { return 0; }
 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
-void clocksource_mark_unstable(struct clocksource *cs) { }
 
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
@@ -538,55 +537,40 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
 }
 
 /**
- * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
- * @mult:	cycle to nanosecond multiplier
- * @shift:	cycle to nanosecond divisor (power of two)
- * @maxadj:	maximum adjustment value to mult (~11%)
- * @mask:	bitmask for two's complement subtraction of non 64 bit counters
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
 */
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+static u64 clocksource_max_deferment(struct clocksource *cs)
 {
 	u64 max_nsecs, max_cycles;
 
 	/*
 	 * Calculate the maximum number of cycles that we can pass to the
 	 * cyc2ns function without overflowing a 64-bit signed result. The
-	 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
+	 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
 	 * which is equivalent to the below.
-	 * max_cycles < (2^63)/(mult + maxadj)
-	 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
-	 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
-	 * max_cycles < 2^(63 - log2(mult + maxadj))
-	 * max_cycles < 1 << (63 - log2(mult + maxadj))
+	 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
+	 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
+	 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
+	 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
+	 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
 	 * Please note that we add 1 to the result of the log2 to account for
 	 * any rounding errors, ensure the above inequality is satisfied and
 	 * no overflow will occur.
 	 */
-	max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+	max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
 
 	/*
 	 * The actual maximum number of cycles we can defer the clocksource is
-	 * determined by the minimum of max_cycles and mask.
+	 * determined by the minimum of max_cycles and cs->mask.
 	 * Note: Here we subtract the maxadj to make sure we don't sleep for
 	 * too long if there's a large negative adjustment.
 	 */
-	max_cycles = min(max_cycles, mask);
-	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
-
-	return max_nsecs;
-}
-
-/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs:         Pointer to clocksource
- *
- */
-static u64 clocksource_max_deferment(struct clocksource *cs)
-{
-	u64 max_nsecs;
+	max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
+					cs->shift);
 
-	max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
-					  cs->mask);
 	/*
 	 * To ensure that the clocksource does not wrap whilst we are idle,
 	 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -909,7 +893,7 @@ sysfs_show_current_clocksources(struct device *dev,
 	return count;
 }
 
-ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
 {
 	size_t ret = cnt;
 
@@ -940,7 +924,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
 					  struct device_attribute *attr,
 					  const char *buf, size_t count)
 {
-	ssize_t ret;
+	size_t ret;
 
 	mutex_lock(&clocksource_mutex);
 
@@ -968,7 +952,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
 {
 	struct clocksource *cs;
 	char name[CS_NAME_LEN];
-	ssize_t ret;
+	size_t ret;
 
 	ret = sysfs_get_uname(buf, name, count);
 	if (ret < 0)
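[Editor's aside — not part of the commit. The inequality chain in the comment above just bounds max_cycles so that cycles * (mult + maxadj) stays below 2^63. A standalone version of the computation, with illustrative (assumed) mult/shift values for a 1 GHz counter:

#include <stdint.h>
#include <stdio.h>

static int ilog2_u32(uint32_t v)
{
	int l = -1;

	while (v) { v >>= 1; l++; }	/* floor(log2(v)) */
	return l;
}

int main(void)
{
	uint32_t mult = 4194304, shift = 22;	/* cyc2ns: cyc * 2^22 >> 22 */
	uint32_t maxadj = mult / 9;		/* roughly the ~11% headroom */
	uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));
	uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

	printf("defer at most %llu cycles (%llu ns)\n",
	       (unsigned long long)max_cycles, (unsigned long long)max_nsecs);
	return 0;
}

With these numbers max_cycles comes out as 2^40 and the deferment bound is on the order of 10^12 ns, before the further 12.5% safety margin the kernel applies.]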
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4..bb22151 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,7 +475,6 @@ static void sync_cmos_clock(struct work_struct *work)
 	 * called as close as possible to 500 ms before the new second starts.
 	 * This code is run on a timer.  If the clock is set, that timer
 	 * may not expire at the correct time.  Thus, we adjust...
-	 * We want the clock to be within a couple of ticks from the target.
 	 */
 	if (!ntp_synced()) {
 		/*
@@ -486,7 +485,7 @@ static void sync_cmos_clock(struct work_struct *work)
 	}
 
 	getnstimeofday(&now);
-	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
+	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
 		struct timespec adjust = now;
 
 		fail = -ENODEV;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68b7993..0b479a6 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,28 +8,25 @@
 #include <linux/clocksource.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
-#include <linux/ktime.h>
 #include <linux/kernel.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/syscore_ops.h>
-#include <linux/hrtimer.h>
+#include <linux/timer.h>
 #include <linux/sched_clock.h>
-#include <linux/seqlock.h>
-#include <linux/bitops.h>
 
 struct clock_data {
-	ktime_t wrap_kt;
 	u64 epoch_ns;
-	u64 epoch_cyc;
-	seqcount_t seq;
+	u32 epoch_cyc;
+	u32 epoch_cyc_copy;
 	unsigned long rate;
 	u32 mult;
 	u32 shift;
 	bool suspended;
 };
 
-static struct hrtimer sched_clock_timer;
+static void sched_clock_poll(unsigned long wrap_ticks);
+static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
+
 static int irqtime = -1;
 
 core_param(irqtime, irqtime, int, 0400);
@@ -38,46 +35,42 @@ static struct clock_data cd = {
 	.mult	= NSEC_PER_SEC / HZ,
 };
 
-static u64 __read_mostly sched_clock_mask;
+static u32 __read_mostly sched_clock_mask = 0xffffffff;
 
-static u64 notrace jiffy_sched_clock_read(void)
+static u32 notrace jiffy_sched_clock_read(void)
 {
-	/*
-	 * We don't need to use get_jiffies_64 on 32-bit arches here
-	 * because we register with BITS_PER_LONG
-	 */
-	return (u64)(jiffies - INITIAL_JIFFIES);
+	return (u32)(jiffies - INITIAL_JIFFIES);
 }
 
-static u32 __read_mostly (*read_sched_clock_32)(void);
-
-static u64 notrace read_sched_clock_32_wrapper(void)
-{
-	return read_sched_clock_32();
-}
-
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
 
 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
 	return (cyc * mult) >> shift;
 }
 
-unsigned long long notrace sched_clock(void)
+static unsigned long long notrace sched_clock_32(void)
 {
 	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 cyc;
-	unsigned long seq;
+	u32 epoch_cyc;
+	u32 cyc;
 
 	if (cd.suspended)
 		return cd.epoch_ns;
 
+	/*
+	 * Load the epoch_cyc and epoch_ns atomically. We do this by
+	 * ensuring that we always write epoch_cyc, epoch_ns and
+	 * epoch_cyc_copy in strict order, and read them in strict order.
+	 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
+	 * the middle of an update, and we should repeat the load.
+	 */
 	do {
-		seq = read_seqcount_begin(&cd.seq);
 		epoch_cyc = cd.epoch_cyc;
+		smp_rmb();
 		epoch_ns = cd.epoch_ns;
-	} while (read_seqcount_retry(&cd.seq, seq));
+		smp_rmb();
+	} while (epoch_cyc != cd.epoch_cyc_copy);
 
 	cyc = read_sched_clock();
 	cyc = (cyc - epoch_cyc) & sched_clock_mask;
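[Editor's aside — not part of the commit. The epoch_cyc/epoch_cyc_copy pair restored here is a poor man's seqlock: the writer updates the copy, then the payload, then the original, and the reader retries while the two ends disagree (the write side appears in the next hunk). A compilable sketch with C11 fences standing in for smp_wmb()/smp_rmb(); like the kernel code, it assumes a single writer:

#include <stdatomic.h>
#include <stdint.h>

static uint32_t epoch_cyc, epoch_cyc_copy;
static uint64_t epoch_ns;

static void write_epoch(uint32_t cyc, uint64_t ns)
{
	epoch_cyc_copy = cyc;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	epoch_ns = ns;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	epoch_cyc = cyc;
}

static void read_epoch(uint32_t *cyc, uint64_t *ns)
{
	do {
		*cyc = epoch_cyc;
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		*ns = epoch_ns;
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	} while (*cyc != epoch_cyc_copy);	/* torn read: go again */
}

A mismatch between the two u32 ends means the reader raced an update, so it simply loads again; the seqcount_t this revert removes generalizes the same idea.]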
@@ -90,46 +83,49 @@ unsigned long long notrace sched_clock(void)
 static void notrace update_sched_clock(void)
 {
 	unsigned long flags;
-	u64 cyc;
+	u32 cyc;
 	u64 ns;
 
 	cyc = read_sched_clock();
 	ns = cd.epoch_ns +
 		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
 			  cd.mult, cd.shift);
-
+	/*
+	 * Write epoch_cyc and epoch_ns in a way that the update is
+	 * detectable in cyc_to_fixed_sched_clock().
+	 */
 	raw_local_irq_save(flags);
-	write_seqcount_begin(&cd.seq);
+	cd.epoch_cyc_copy = cyc;
+	smp_wmb();
 	cd.epoch_ns = ns;
+	smp_wmb();
 	cd.epoch_cyc = cyc;
-	write_seqcount_end(&cd.seq);
 	raw_local_irq_restore(flags);
 }
 
-static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
+static void sched_clock_poll(unsigned long wrap_ticks)
 {
+	mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
 	update_sched_clock();
-	hrtimer_forward_now(hrt, cd.wrap_kt);
-	return HRTIMER_RESTART;
 }
 
-void __init sched_clock_register(u64 (*read)(void), int bits,
-				 unsigned long rate)
+void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 {
-	unsigned long r;
+	unsigned long r, w;
 	u64 res, wrap;
 	char r_unit;
 
 	if (cd.rate > rate)
 		return;
 
+	BUG_ON(bits > 32);
 	WARN_ON(!irqs_disabled());
 	read_sched_clock = read;
-	sched_clock_mask = CLOCKSOURCE_MASK(bits);
+	sched_clock_mask = (1ULL << bits) - 1;
 	cd.rate = rate;
 
 	/* calculate the mult/shift to convert counter ticks to ns. */
-	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
+	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
 
 	r = rate;
 	if (r >= 4000000) {
@@ -142,14 +138,20 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 		r_unit = ' ';
 
 	/* calculate how many ns until we wrap */
-	wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
-	cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+	wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
+	do_div(wrap, NSEC_PER_MSEC);
+	w = wrap;
 
 	/* calculate the ns resolution of this counter */
 	res = cyc_to_ns(1ULL, cd.mult, cd.shift);
-	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
-		bits, r, r_unit, res, wrap);
+	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
+		bits, r, r_unit, res, w);
 
+	/*
+	 * Start the timer to keep sched_clock() properly updated and
+	 * sets the initial epoch.
+	 */
+	sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
 	update_sched_clock();
 
 	/*
@@ -164,10 +166,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 	pr_debug("Registered %pF as sched_clock source\n", read);
 }
 
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
+unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
+
+unsigned long long notrace sched_clock(void)
 {
-	read_sched_clock_32 = read;
-	sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
+	return sched_clock_func();
 }
 
 void __init sched_clock_postinit(void)
@@ -177,22 +180,14 @@
 	 * make it the final one one.
 	 */
 	if (read_sched_clock == jiffy_sched_clock_read)
-		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
+		setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
 
-	update_sched_clock();
-
-	/*
-	 * Start the timer to keep sched_clock() properly updated and
-	 * sets the initial epoch.
-	 */
-	hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	sched_clock_timer.function = sched_clock_poll;
-	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+	sched_clock_poll(sched_clock_timer.data);
 }
 
 static int sched_clock_suspend(void)
 {
-	sched_clock_poll(&sched_clock_timer);
+	sched_clock_poll(sched_clock_timer.data);
 	cd.suspended = true;
 	return 0;
 }
@@ -200,6 +195,7 @@ static int sched_clock_suspend(void)
 static void sched_clock_resume(void)
 {
 	cd.epoch_cyc = read_sched_clock();
+	cd.epoch_cyc_copy = cd.epoch_cyc;
 	cd.suspended = false;
 }
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9532690..218bcb5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,7 +70,6 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
 					struct clock_event_device *newdev)
 {
 	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
-	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
 	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
 		return false;
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 162b03a..64522ec 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,21 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
 */
 ktime_t tick_next_period;
 ktime_t tick_period;
-
-/*
- * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
- * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
- * variable has two functions:
- *
- * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
- *    timekeeping lock all at once. Only the CPU which is assigned to do the
- *    update is handling it.
- *
- * 2) Hand off the duty in the NOHZ idle case by setting the value to
- *    TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
- *    at it will take over and keep the time keeping alive.  The handover
- *    procedure also covers cpu hotplug.
- */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 18e71f7..bc906ca 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
 
 extern void clockevents_shutdown(struct clock_event_device *dev);
 
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
 
 /*
  * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ea20f7d..3612fc7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
 /*
  * NO HZ enabled ?
 */
-static int tick_nohz_enabled __read_mostly = 1;
-int tick_nohz_active __read_mostly;
+int tick_nohz_enabled __read_mostly = 1;
+
 /*
  * Enable / Disable tickless mode
  */
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, idle;
 
-	if (!tick_nohz_active)
+	if (!tick_nohz_enabled)
 		return -1;
 
 	now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, iowait;
 
-	if (!tick_nohz_active)
+	if (!tick_nohz_enabled)
 		return -1;
 
 	now = ktime_get();
@@ -711,10 +711,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}
 
-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
-		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		return false;
-	}
 
 	if (need_resched())
 		return false;
@@ -801,6 +799,11 @@ void tick_nohz_idle_enter(void)
 	local_irq_disable();
 
 	ts = &__get_cpu_var(tick_cpu_sched);
+	/*
+	 * set ts->inidle unconditionally. even if the system did not
+	 * switch to nohz mode the cpu frequency governers rely on the
+	 * update of the idle time accounting in tick_nohz_start_idle().
+	 */
 	ts->inidle = 1;
 
 	__tick_nohz_idle_enter(ts);
@@ -970,7 +973,7 @@ static void tick_nohz_switch_to_nohz(void)
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t next;
 
-	if (!tick_nohz_active)
+	if (!tick_nohz_enabled)
 		return;
 
 	local_irq_disable();
@@ -978,7 +981,7 @@ static void tick_nohz_switch_to_nohz(void)
 		local_irq_enable();
 		return;
 	}
-	tick_nohz_active = 1;
+
 	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
 	/*
@@ -1136,10 +1139,8 @@ void tick_setup_sched_timer(void)
 	}
 
 #ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled) {
+	if (tick_nohz_enabled)
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		tick_nohz_active = 1;
-	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 87b4f00..947ba25 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
 	tk->xtime_nsec -= remainder;
 	tk->xtime_nsec += 1ULL << tk->shift;
 	tk->ntp_error += remainder << tk->ntp_error_shift;
-	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
+
 }
 #else
 #define old_vsyscall_fixup(tk)
@@ -1613,10 +1613,9 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
 * ktime_get_update_offsets - hrtimer helper
 * @offs_real:	pointer to storage for monotonic -> realtime offset
 * @offs_boot:	pointer to storage for monotonic -> boottime offset
- * @offs_tai:	pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time and updates the offsets
- * Called from hrtimer_interrupt() or retrigger_next_event()
+ * Called from hrtimer_interupt() or retrigger_next_event()
 */
 ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
							ktime_t *offs_tai)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f2..0b537f2 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
 	period = ktime_to_timespec(time);
 	ms = period.tv_nsec / 1000000;
 
-	seq_puts(m, "Timer Stats Version: v0.3\n");
+	seq_puts(m, "Timer Stats Version: v0.2\n");
 	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
 	if (atomic_read(&overflow_count))
-		seq_printf(m, "Overflow: %d entries\n",
-			   atomic_read(&overflow_count));
-	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
+		seq_printf(m, "Overflow: %d entries\n",
+			   atomic_read(&overflow_count));
 
 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
 			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
 		} else {
diff --git a/kernel/timer.c b/kernel/timer.c
index accfd24..4296d13 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
-	int count = preempt_count();
+	int preempt_count = preempt_count();
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 
 	lock_map_release(&lockdep_map);
 
-	if (count != preempt_count()) {
+	if (preempt_count != preempt_count()) {
 		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
-			  fn, count, preempt_count());
+			  fn, preempt_count, preempt_count());
 		/*
 		 * Restore the preempt count. That gives us a decent
 		 * chance to survive and extract information. If the
 		 * callback kept a lock held, bad luck, but not worse
 		 * than the BUG() we had.
 		 */
-		preempt_count_set(count);
+		preempt_count() = preempt_count;
 	}
 }
 
@@ -1518,8 +1518,9 @@ static int init_timers_cpu(int cpu)
 			/*
 			 * The APs use this path later in boot
 			 */
-			base = kzalloc_node(sizeof(*base), GFP_KERNEL,
-					    cpu_to_node(cpu));
+			base = kmalloc_node(sizeof(*base),
+					    GFP_KERNEL | __GFP_ZERO,
+					    cpu_to_node(cpu));
 			if (!base)
 				return -ENOMEM;
 
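[Editor's aside — not part of the commit. call_timer_fn() above wraps every callback in a save/compare/restore of the preempt count, so one buggy timer cannot poison the whole CPU; the revert even restores a local variable that shadows the preempt_count() macro. The same defensive shape in portable C, with a toy lock-depth counter standing in for the preempt count (all names illustrative):

#include <stdio.h>

static int lock_depth;	/* stand-in for preempt_count() */

static void run_callback(void (*fn)(void))
{
	int saved = lock_depth;

	fn();
	if (lock_depth != saved) {
		fprintf(stderr, "callback leaked a lock: %d -> %d\n",
			saved, lock_depth);
		lock_depth = saved;	/* restore so we can limp on */
	}
}

static void buggy(void) { lock_depth++; }	/* forgets to unlock */

int main(void)
{
	run_callback(buggy);
	return 0;
}

Restoring the saved value after warning is the "survive and extract information" policy the kernel comment describes.]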
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f785aef..b8b8560 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,7 +26,6 @@
 #include <linux/export.h>
 #include <linux/time.h>
 #include <linux/uaccess.h>
-#include <linux/list.h>
 
 #include <trace/events/block.h>
 
@@ -39,9 +38,6 @@ static unsigned int blktrace_seq __read_mostly = 1;
 static struct trace_array *blk_tr;
 static bool blk_tracer_enabled __read_mostly;
 
-static LIST_HEAD(running_trace_list);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
-
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC	0x1
 
@@ -111,18 +107,10 @@ record_it:
  * Send out a notify for this process, if we haven't done so since a trace
  * started
  */
-static void trace_note_tsk(struct task_struct *tsk)
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
 {
-	unsigned long flags;
-	struct blk_trace *bt;
-
 	tsk->btrace_seq = blktrace_seq;
-	spin_lock_irqsave(&running_trace_lock, flags);
-	list_for_each_entry(bt, &running_trace_list, running_list) {
-		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
-			   sizeof(tsk->comm));
-	}
-	spin_unlock_irqrestore(&running_trace_lock, flags);
+	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
 }
 
 static void trace_note_time(struct blk_trace *bt)
@@ -241,15 +229,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		goto record_it;
 	}
 
-	if (unlikely(tsk->btrace_seq != blktrace_seq))
-		trace_note_tsk(tsk);
-
 	/*
 	 * A word about the locking here - we disable interrupts to reserve
 	 * some space in the relay per-cpu buffer, to prevent an irq
 	 * from coming in and stepping on our toes.
 	 */
 	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
 	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
 	if (t) {
 		sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -488,7 +477,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->dir = dir;
 	bt->dev = dev;
 	atomic_set(&bt->dropped, 0);
-	INIT_LIST_HEAD(&bt->running_list);
 
 	ret = -EIO;
 	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -579,12 +567,13 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 		.end_lba = cbuts.end_lba,
 		.pid = cbuts.pid,
 	};
+	memcpy(&buts.name, &cbuts.name, 32);
 
 	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 	if (ret)
 		return ret;
 
-	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+	if (copy_to_user(arg, &buts.name, 32)) {
 		blk_trace_remove(q);
 		return -EFAULT;
 	}
@@ -612,9 +601,6 @@ int blk_trace_startstop(struct request_queue *q, int start)
 			blktrace_seq++;
 			smp_mb();
 			bt->trace_state = Blktrace_running;
-			spin_lock_irq(&running_trace_lock);
-			list_add(&bt->running_list, &running_trace_list);
-			spin_unlock_irq(&running_trace_lock);
 
 			trace_note_time(bt);
 			ret = 0;
@@ -622,9 +608,6 @@ int blk_trace_startstop(struct request_queue *q, int start)
 	} else {
 		if (bt->trace_state == Blktrace_running) {
 			bt->trace_state = Blktrace_stopped;
-			spin_lock_irq(&running_trace_lock);
-			list_del_init(&bt->running_list);
-			spin_unlock_irq(&running_trace_lock);
 			relay_flush(bt->rchan);
 			ret = 0;
 		}
@@ -1489,9 +1472,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
 
-	spin_lock_irq(&running_trace_lock);
-	list_del(&bt->running_list);
-	spin_unlock_irq(&running_trace_lock);
 	blk_trace_free(bt);
 	return 0;
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0e9f9ea..03cf44a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,6 +367,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
 	if (FTRACE_WARN_ON(ops == &global_ops))
 		return -EINVAL;
 
@@ -425,6 +428,9 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
+	if (ftrace_disabled)
+		return -ENODEV;
+
 	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
 		return -EBUSY;
 
@@ -2082,15 +2088,10 @@ static void ftrace_startup_enable(int command)
 static int ftrace_startup(struct ftrace_ops *ops, int command)
 {
 	bool hash_enable = true;
-	int ret;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
-	ret = __register_ftrace_function(ops);
-	if (ret)
-		return ret;
-
 	ftrace_start_up++;
 	command |= FTRACE_UPDATE_CALLS;
 
@@ -2112,17 +2113,12 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
 	return 0;
 }
 
-static int ftrace_shutdown(struct ftrace_ops *ops, int command)
+static void ftrace_shutdown(struct ftrace_ops *ops, int command)
 {
 	bool hash_disable = true;
-	int ret;
 
 	if (unlikely(ftrace_disabled))
-		return -ENODEV;
-
-	ret = __unregister_ftrace_function(ops);
-	if (ret)
-		return ret;
+		return;
 
 	ftrace_start_up--;
 	/*
@@ -2157,10 +2153,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	}
 
 	if (!command || !ftrace_enabled)
-		return 0;
+		return;
 
 	ftrace_run_update_code(command);
-	return 0;
 }
 
 static void ftrace_startup_sysctl(void)
@@ -3065,13 +3060,16 @@ static void __enable_ftrace_function_probe(void)
 	if (i == FTRACE_FUNC_HASHSIZE)
 		return;
 
-	ret = ftrace_startup(&trace_probe_ops, 0);
+	ret = __register_ftrace_function(&trace_probe_ops);
+	if (!ret)
+		ret = ftrace_startup(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 1;
 }
 
 static void __disable_ftrace_function_probe(void)
 {
+	int ret;
 	int i;
 
 	if (!ftrace_probe_registered)
@@ -3084,7 +3082,9 @@ static void __disable_ftrace_function_probe(void)
 	}
 
 	/* no more funcs left */
-	ftrace_shutdown(&trace_probe_ops, 0);
+	ret = __unregister_ftrace_function(&trace_probe_ops);
+	if (!ret)
+		ftrace_shutdown(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 0;
 }
@@ -3307,11 +3307,7 @@ void unregister_ftrace_function_probe_all(char *glob)
 static LIST_HEAD(ftrace_commands);
 static DEFINE_MUTEX(ftrace_cmd_mutex);
 
-/*
- * Currently we only register ftrace commands from __init, so mark this
- * __init too.
- */
-__init int register_ftrace_command(struct ftrace_func_command *cmd)
+int register_ftrace_command(struct ftrace_func_command *cmd)
 {
 	struct ftrace_func_command *p;
 	int ret = 0;
@@ -3330,11 +3326,7 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
 	return ret;
 }
 
-/*
- * Currently we only unregister ftrace commands from __init, so mark
- * this __init too.
- */
-__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
+int unregister_ftrace_command(struct ftrace_func_command *cmd)
 {
 	struct ftrace_func_command *p, *n;
 	int ret = -ENODEV;
@@ -3649,7 +3641,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
 
 static int __init set_graph_function(char *str)
 {
@@ -3667,7 +3659,7 @@ static void __init set_ftrace_early_graph(char *buf)
 		func = strsep(&buf, ",");
 		/* we allow only one expression at a time */
 		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
-				      FTRACE_GRAPH_MAX_FUNCS, func);
+				      func);
 		if (ret)
 			printk(KERN_DEBUG "ftrace: function %s not "
 					  "traceable\n", func);
@@ -3784,25 +3776,15 @@ static const struct file_operations ftrace_notrace_fops = {
 static DEFINE_MUTEX(graph_lock);
 
 int ftrace_graph_count;
-int ftrace_graph_notrace_count;
+int ftrace_graph_filter_enabled;
 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-
-struct ftrace_graph_data {
-	unsigned long *table;
-	size_t size;
-	int *count;
-	const struct seq_operations *seq_ops;
-};
 
 static void *
 __g_next(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_graph_data *fgd = m->private;
-
-	if (*pos >= *fgd->count)
+	if (*pos >= ftrace_graph_count)
 		return NULL;
-	return &fgd->table[*pos];
+	return &ftrace_graph_funcs[*pos];
 }
 
 static void *
@@ -3814,12 +3796,10 @@ g_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *g_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_graph_data *fgd = m->private;
-
 	mutex_lock(&graph_lock);
 
 	/* Nothing, tell g_show to print all functions are enabled */
-	if (!*fgd->count && !*pos)
+	if (!ftrace_graph_filter_enabled && !*pos)
 		return (void *)1;
 
 	return __g_next(m, pos);
@@ -3855,88 +3835,38 @@ static const struct seq_operations ftrace_graph_seq_ops = {
 };
 
 static int
-__ftrace_graph_open(struct inode *inode, struct file *file,
-		    struct ftrace_graph_data *fgd)
+ftrace_graph_open(struct inode *inode, struct file *file)
 {
 	int ret = 0;
 
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
 	mutex_lock(&graph_lock);
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC)) {
-		*fgd->count = 0;
-		memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+		ftrace_graph_filter_enabled = 0;
+		ftrace_graph_count = 0;
+		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
 	}
 	mutex_unlock(&graph_lock);
 
-	if (file->f_mode & FMODE_READ) {
-		ret = seq_open(file, fgd->seq_ops);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			m->private = fgd;
-		}
-	} else
-		file->private_data = fgd;
+	if (file->f_mode & FMODE_READ)
+		ret = seq_open(file, &ftrace_graph_seq_ops);
 
 	return ret;
 }
 
 static int
-ftrace_graph_open(struct inode *inode, struct file *file)
-{
-	struct ftrace_graph_data *fgd;
-
-	if (unlikely(ftrace_disabled))
-		return -ENODEV;
-
-	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
-	if (fgd == NULL)
-		return -ENOMEM;
-
-	fgd->table = ftrace_graph_funcs;
-	fgd->size = FTRACE_GRAPH_MAX_FUNCS;
-	fgd->count = &ftrace_graph_count;
-	fgd->seq_ops = &ftrace_graph_seq_ops;
-
-	return __ftrace_graph_open(inode, file, fgd);
-}
-
-static int
-ftrace_graph_notrace_open(struct inode *inode, struct file *file)
-{
-	struct ftrace_graph_data *fgd;
-
-	if (unlikely(ftrace_disabled))
-		return -ENODEV;
-
-	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
-	if (fgd == NULL)
-		return -ENOMEM;
-
-	fgd->table = ftrace_graph_notrace_funcs;
-	fgd->size = FTRACE_GRAPH_MAX_FUNCS;
-	fgd->count = &ftrace_graph_notrace_count;
-	fgd->seq_ops = &ftrace_graph_seq_ops;
-
-	return __ftrace_graph_open(inode, file, fgd);
-}
-
-static int
 ftrace_graph_release(struct inode *inode, struct file *file)
 {
-	if (file->f_mode & FMODE_READ) {
-		struct seq_file *m = file->private_data;
-
-		kfree(m->private);
+	if (file->f_mode & FMODE_READ)
 		seq_release(inode, file);
-	} else {
-		kfree(file->private_data);
-	}
-
 	return 0;
 }
 
 static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
@@ -3949,7 +3879,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
 
 	/* decode regex */
 	type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
-	if (!not && *idx >= size)
+	if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
 		return -EBUSY;
 
 	search_len = strlen(search);
@@ -3977,7 +3907,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
 				fail = 0;
 				if (!exists) {
 					array[(*idx)++] = rec->ip;
-					if (*idx >= size)
+					if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
 						goto out;
 				}
 			} else {
@@ -3995,6 +3925,8 @@ out:
 	if (fail)
 		return -EINVAL;
 
+	ftrace_graph_filter_enabled = !!(*idx);
+
 	return 0;
 }
 
@@ -4003,33 +3935,36 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
 	struct trace_parser parser;
-	ssize_t read, ret = 0;
-	struct ftrace_graph_data *fgd = file->private_data;
+	ssize_t read, ret;
 
 	if (!cnt)
 		return 0;
 
-	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
-		return -ENOMEM;
+	mutex_lock(&graph_lock);
+
+	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
 
 	read = trace_get_user(&parser, ubuf, cnt, ppos);
 
 	if (read >= 0 && trace_parser_loaded((&parser))) {
 		parser.buffer[parser.idx] = 0;
 
-		mutex_lock(&graph_lock);
-
 		/* we allow only one expression at a time */
-		ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
-				      parser.buffer);
-
-		mutex_unlock(&graph_lock);
+		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+					parser.buffer);
+		if (ret)
+			goto out_free;
 	}
 
-	if (!ret)
-		ret = read;
+	ret = read;
 
+out_free:
 	trace_parser_put(&parser);
+out_unlock:
+	mutex_unlock(&graph_lock);
 	return ret;
 }
 
@@ -4041,14 +3976,6 @@ static const struct file_operations ftrace_graph_fops = {
 	.llseek		= ftrace_filter_lseek,
 	.release	= ftrace_graph_release,
 };
-
-static const struct file_operations ftrace_graph_notrace_fops = {
-	.open		= ftrace_graph_notrace_open,
-	.read		= seq_read,
-	.write		= ftrace_graph_write,
-	.llseek		= ftrace_filter_lseek,
-	.release	= ftrace_graph_release,
-};
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
@@ -4070,9 +3997,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 	trace_create_file("set_graph_function", 0444, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	trace_create_file("set_graph_notrace", 0444, d_tracer,
-				    NULL,
-				    &ftrace_graph_notrace_fops);
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 	return 0;
@@ -4366,15 +4290,12 @@ core_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 /* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command)					\
-	({								\
-		int ___ret = __register_ftrace_function(ops);		\
-		if (!___ret)						\
-			(ops)->flags |= FTRACE_OPS_FL_ENABLED;		\
-		___ret;							\
+# define ftrace_startup(ops, command)			\
+	({						\
+		(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
+		0;					\
 	})
-# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
-
+# define ftrace_shutdown(ops, command)	do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 
@@ -4399,21 +4320,12 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 	 */
 	preempt_disable_notrace();
 	trace_recursion_set(TRACE_CONTROL_BIT);
-
-	/*
-	 * Control funcs (perf) uses RCU. Only trace if
-	 * RCU is currently active.
-	 */
-	if (!rcu_is_watching())
-		goto out;
-
 	do_for_each_ftrace_op(op, ftrace_control_list) {
 		if (!(op->flags & FTRACE_OPS_FL_STUB) &&
 		    !ftrace_function_local_disabled(op) &&
 		    ftrace_ops_test(op, ip, regs))
 			op->func(ip, parent_ip, op, regs);
 	} while_for_each_ftrace_op(op);
- out:
 	trace_recursion_clear(TRACE_CONTROL_BIT);
 	preempt_enable_notrace();
 }
@@ -4783,7 +4695,9 @@ int register_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_lock);
 
-	ret = ftrace_startup(ops, 0);
+	ret = __register_ftrace_function(ops);
+	if (!ret)
+		ret = ftrace_startup(ops, 0);
 
 	mutex_unlock(&ftrace_lock);
 
@@ -4802,7 +4716,9 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	int ret;
 
 	mutex_lock(&ftrace_lock);
-	ret = ftrace_shutdown(ops, 0);
+	ret = __unregister_ftrace_function(ops);
+	if (!ret)
+		ftrace_shutdown(ops, 0);
 	mutex_unlock(&ftrace_lock);
 
 	return ret;
@@ -4996,13 +4912,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
 	return NOTIFY_DONE;
 }
 
-/* Just a place holder for function graph */
-static struct ftrace_ops fgraph_ops __read_mostly = {
-	.func		= ftrace_stub,
-	.flags		= FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
-				FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 			trace_func_graph_ent_t entryfunc)
 {
@@ -5029,7 +4938,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
 
-	ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
+	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
 
 out:
 	mutex_unlock(&ftrace_lock);
@@ -5046,7 +4955,7 @@ void unregister_ftrace_graph(void)
 	ftrace_graph_active--;
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
-	ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
+	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
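[Editor's aside — not part of the commit. After this revert, enabling a tracer is again two hand-paired phases: register the ops, then start them, unwinding the first step if the second fails (visible in register_ftrace_function() above). The generic shape of that contract, sketched with illustrative names:

#include <errno.h>

struct ops { int registered; int started; };

static int register_ops(struct ops *o) { o->registered = 1; return 0; }
static void unregister_ops(struct ops *o) { o->registered = 0; }

static int startup_ops(struct ops *o)
{
	if (!o->registered)
		return -ENODEV;
	o->started = 1;
	return 0;
}

static int enable(struct ops *o)
{
	int ret = register_ops(o);

	if (ret)
		return ret;
	ret = startup_ops(o);
	if (ret)
		unregister_ops(o);	/* unwind phase one on failure */
	return ret;
}

Folding the pairing into ftrace_startup()/ftrace_shutdown() — the change being undone here — removed the risk of callers forgetting the unwind.]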
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d20cd9..7974ba2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,33 +235,13 @@ void trace_array_put(struct trace_array *this_tr)
 	mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
-			 struct ring_buffer *buffer,
-			 struct ring_buffer_event *event)
+int filter_current_check_discard(struct ring_buffer *buffer,
+				 struct ftrace_event_call *call, void *rec,
+				 struct ring_buffer_event *event)
 {
-	if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
-	    !filter_match_preds(file->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
-		return 1;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(filter_check_discard);
-
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
-			      struct ring_buffer *buffer,
-			      struct ring_buffer_event *event)
-{
-	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
-	    !filter_match_preds(call->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
-		return 1;
-	}
-
-	return 0;
+	return filter_check_discard(call, rec, buffer, event);
 }
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
 
 cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
@@ -863,12 +843,9 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 	if (isspace(ch)) {
 		parser->buffer[parser->idx] = 0;
 		parser->cont = false;
-	} else if (parser->idx < parser->size - 1) {
+	} else {
 		parser->cont = true;
 		parser->buffer[parser->idx++] = ch;
-	} else {
-		ret = -EINVAL;
-		goto out;
 	}
 
 	*ppos += read;
@@ -1284,6 +1261,21 @@ int is_tracing_stopped(void)
 }
 
 /**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomally has
+ * been detected.  This will turn off the function tracing,
+ * ring buffers, and other tracing utilites.  It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+	tracing_disabled = 1;
+	ftrace_stop();
+	tracing_off_permanent();
+}
+
+/**
 * tracing_start - quick start of the tracer
 *
 * If tracing is enabled but was stopped by tracing_stop,
@@ -1517,8 +1509,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 #endif
 		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
-		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
-		(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
@@ -1639,7 +1630,7 @@ trace_function(struct trace_array *tr,
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
+	if (!filter_check_discard(call, entry, buffer, event))
 		__buffer_unlock_commit(buffer, event);
 }
 
@@ -1723,7 +1714,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 
 	entry->size = trace.nr_entries;
 
-	if (!call_filter_check_discard(call, entry, buffer, event))
+	if (!filter_check_discard(call, entry, buffer, event))
 		__buffer_unlock_commit(buffer, event);
 
  out:
@@ -1825,7 +1816,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	if (!call_filter_check_discard(call, entry, buffer, event))
+	if (!filter_check_discard(call, entry, buffer, event))
 		__buffer_unlock_commit(buffer, event);
 
  out_drop_count:
@@ -2017,7 +2008,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
-	if (!call_filter_check_discard(call, entry, buffer, event)) {
+	if (!filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(buffer, flags, 6, pc);
 	}
@@ -2072,7 +2063,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 	memcpy(&entry->buf, tbuffer, len);
 	entry->buf[len] = '\0';
 
-	if (!call_filter_check_discard(call, entry, buffer, event)) {
+	if (!filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(buffer, flags, 6, pc);
 	}
@@ -2769,7 +2760,7 @@ static void show_snapshot_main_help(struct seq_file *m)
 	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
 	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
 	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n");
-	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
+	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
 	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");
 	seq_printf(m, "#                       is not a '0' or '1')\n");
 }
@@ -2973,11 +2964,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-bool tracing_is_disabled(void)
-{
-	return (tracing_disabled) ? true: false;
-}
-
 /*
 * Open and update trace_array ref count.
* Must have the current trace_array passed to it. @@ -5468,12 +5454,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { .func = ftrace_trace_snapshot_callback, }; -static __init int register_snapshot_cmd(void) +static int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else -static inline __init int register_snapshot_cmd(void) { return 0; } +static inline int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) @@ -6267,17 +6253,6 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->trace_buffer = &global_trace.trace_buffer; - - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - - /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->trace_buffer->buffer)) - iter->iter_flags |= TRACE_FILE_ANNOTATE; - - /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[iter->tr->clock_id].in_ns) - iter->iter_flags |= TRACE_FILE_TIME_IN_NS; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea189e0..10c86fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -124,7 +124,6 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_PREEMPT_RESCHED = 0x20, }; #define TRACE_BUF_SIZE 1024 @@ -193,8 +192,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; + DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); + DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); #endif int stop_count; int clock_id; @@ -515,7 +514,6 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); -bool tracing_is_disabled(void); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -713,8 +711,6 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 -#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -734,16 +730,15 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -763,31 +758,11 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } - -static inline int ftrace_graph_notrace_addr(unsigned long addr) -{ - int i; - - if (!ftrace_graph_notrace_count) - return 0; - - for (i = 0; i < 
ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } - - return 0; -} #else static inline int ftrace_graph_addr(unsigned long addr) { return 1; } - -static inline int ftrace_graph_notrace_addr(unsigned long addr) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t @@ -1011,9 +986,9 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_file *file, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_file *file, +extern int apply_event_filter(struct ftrace_event_call *call, char *filter_string); extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); @@ -1024,6 +999,20 @@ extern int filter_assign_type(const char *type); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); +static inline int +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} + extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9b..d594da0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e854f42..80c36bc 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,15 +24,9 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - if (tp_event->perf_perm) { - int ret = tp_event->perf_perm(tp_event, p_event); - if (ret) - return ret; - } - /* The ftrace function trace is allowed only for root. 
*/ if (ftrace_event_is_function(tp_event) && - perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EPERM; /* No tracing, just counting, so no obvious leak */ @@ -179,7 +173,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - u64 event_id = p_event->attr.config; + int event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11800a..368a4d5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -989,7 +989,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct ftrace_event_call *call; struct trace_seq *s; int r = -ENODEV; @@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file) - print_event_filter(file, s); + call = event_file_data(filp); + if (call) + print_event_filter(call, s); mutex_unlock(&event_mutex); - if (file) + if (call) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -1021,7 +1021,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file; + struct ftrace_event_call *call; char *buf; int err = -ENODEV; @@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, buf[cnt] = '\0'; mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file) - err = apply_event_filter(file, buf); + call = event_file_data(filp); + if (call) + err = apply_event_filter(call, buf); mutex_unlock(&event_mutex); free_page((unsigned long) buf); @@ -1062,9 +1062,6 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct trace_array *tr; int ret; - if (tracing_is_disabled()) - return -ENODEV; - /* Make sure the system still exists */ mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); @@ -1111,9 +1108,6 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; - if (tracing_is_disabled()) - return -ENODEV; - if (trace_array_get(tr) < 0) return -ENODEV; @@ -1130,12 +1124,11 @@ static int system_tr_open(struct inode *inode, struct file *filp) if (ret < 0) { trace_array_put(tr); kfree(dir); - return ret; } filp->private_data = dir; - return 0; + return ret; } static int subsystem_release(struct inode *inode, struct file *file) @@ -1546,7 +1539,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, file, + trace_create_file("filter", 0644, file->dir, call, &ftrace_event_filter_fops); trace_create_file("format", 0444, file->dir, call, @@ -1584,7 +1577,6 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); - destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. 
After finding the call for this @@ -1708,7 +1700,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_call_preds(call); + destroy_preds(call); } static int probe_remove_event_call(struct ftrace_event_call *call) @@ -2314,9 +2306,6 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); - /* Access to events are within rcu_read_lock_sched() */ - synchronize_sched(); - down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2468f56..97daa8c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,18 +637,10 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } -static inline struct event_filter *event_filter(struct ftrace_event_file *file) -{ - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; -} - /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = event_filter(file); + struct event_filter *filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); @@ -774,21 +766,11 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void call_filter_disable(struct ftrace_event_call *call) +static void filter_disable(struct ftrace_event_call *call) { call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static void filter_disable(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call_filter_disable(call); - else - file->flags &= ~FTRACE_EVENT_FL_FILTERED; -} - static void __free_filter(struct event_filter *filter) { if (!filter) @@ -799,30 +781,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } -void destroy_call_preds(struct ftrace_event_call *call) -{ - __free_filter(call->filter); - call->filter = NULL; -} - -static void destroy_file_preds(struct ftrace_event_file *file) -{ - __free_filter(file->filter); - file->filter = NULL; -} - /* - * Called when destroying the ftrace_event_file. - * The file is being freed, so we do not need to worry about - * the file being currently used. This is for module code removing + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing * the tracepoints from within it. 
*/ -void destroy_preds(struct ftrace_event_file *file) +void destroy_preds(struct ftrace_event_call *call) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - destroy_call_preds(file->event_call); - else - destroy_file_preds(file); + __free_filter(call->filter); + call->filter = NULL; } static struct event_filter *__alloc_filter(void) @@ -857,56 +825,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static inline void __remove_filter(struct ftrace_event_file *file) +static void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = file->event_call; - - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); -} - -static void filter_free_subsystem_preds(struct event_subsystem *system, - struct trace_array *tr) -{ - struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(file, &tr->events, list) { - call = file->event_call; + list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - __remove_filter(file); - } -} - -static inline void __free_subsystem_filter(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; + filter_disable(call); + remove_filter_string(call->filter); } } -static void filter_free_subsystem_filters(struct event_subsystem *system, - struct trace_array *tr) +static void filter_free_subsystem_filters(struct event_subsystem *system) { - struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(file, &tr->events, list) { - call = file->event_call; + list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - __free_subsystem_filter(file); + __free_filter(call->filter); + call->filter = NULL; } } @@ -1677,85 +1617,15 @@ fail: return err; } -static inline void event_set_filtered_flag(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= FTRACE_EVENT_FL_FILTERED; -} - -static inline void event_set_filter(struct ftrace_event_file *file, - struct event_filter *filter) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); -} - -static inline void -event_set_no_set_filter_flag(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; -} - -static inline void -event_clear_no_set_filter_flag(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= 
~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; -} - -static inline bool -event_no_set_filter_flag(struct ftrace_event_file *file) -{ - struct ftrace_event_call *call = file->event_call; - - if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) - return true; - - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - - return false; -} - struct filter_list { struct list_head list; struct event_filter *filter; }; static int replace_system_preds(struct event_subsystem *system, - struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { - struct ftrace_event_file *file; struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; @@ -1763,8 +1633,8 @@ static int replace_system_preds(struct event_subsystem *system, bool fail = true; int err; - list_for_each_entry(file, &tr->events, list) { - call = file->event_call; + list_for_each_entry(call, &ftrace_events, list) { + if (strcmp(call->class->system, system->name) != 0) continue; @@ -1774,20 +1644,18 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - event_set_no_set_filter_flag(file); + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; else - event_clear_no_set_filter_flag(file); + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } - list_for_each_entry(file, &tr->events, list) { + list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter; - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) continue; - if (event_no_set_filter_flag(file)) + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) continue; filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1808,17 +1676,17 @@ static int replace_system_preds(struct event_subsystem *system, err = replace_preds(call, filter, ps, filter_string, false); if (err) { - filter_disable(file); + filter_disable(call); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(ps, filter); } else - event_set_filtered_flag(file); + call->flags |= TRACE_EVENT_FL_FILTERED; /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = event_filter(file); - event_set_filter(file, filter_item->filter); + filter = call->filter; + rcu_assign_pointer(call->filter, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1948,7 +1816,6 @@ static int create_filter(struct ftrace_event_call *call, * and always remembers @filter_str. 
*/ static int create_system_filter(struct event_subsystem *system, - struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; @@ -1957,7 +1824,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, tr, ps, filter_str); + err = replace_system_preds(system, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -1973,25 +1840,20 @@ static int create_system_filter(struct event_subsystem *system, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_file *file, char *filter_string) +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct ftrace_event_call *call = file->event_call; struct event_filter *filter; int err; if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(file); - filter = event_filter(file); - + filter_disable(call); + filter = call->filter; if (!filter) return 0; - - event_clear_filter(file); - + RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); - return 0; } @@ -2004,15 +1866,14 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string) * string */ if (filter) { - struct event_filter *tmp; + struct event_filter *tmp = call->filter; - tmp = event_filter(file); if (!err) - event_set_filtered_flag(file); + call->flags |= TRACE_EVENT_FL_FILTERED; else - filter_disable(file); + filter_disable(call); - event_set_filter(file, filter); + rcu_assign_pointer(call->filter, filter); if (tmp) { /* Make sure the call is done with the filter */ @@ -2028,7 +1889,6 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; - struct trace_array *tr = dir->tr; struct event_filter *filter; int err = 0; @@ -2041,18 +1901,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, tr); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system, tr); + filter_free_subsystem_filters(system); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, tr, filter_string, &filter); + err = create_system_filter(system, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e7..d21a746 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0b99120..b5c0924 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -82,9 +82,9 @@ static struct trace_array *graph_array; * to fill in space into DURATION column. 
*/ enum { - FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, - FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, - FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, + DURATION_FILL_FULL = -1, + DURATION_FILL_START = -2, + DURATION_FILL_END = -3, }; static enum print_line_t @@ -114,37 +114,16 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return -EBUSY; } - /* - * The curr_ret_stack is an index to ftrace return stack of - * current task. Its value should be in [0, FTRACE_RETFUNC_ - * DEPTH) when the function graph tracer is used. To support - * filtering out specific functions, it makes the index - * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) - * so when it sees a negative index the ftrace will ignore - * the record. And the index gets recovered when returning - * from the filtered function by adding the FTRACE_NOTRACE_ - * DEPTH and then it'll continue to record functions normally. - * - * The curr_ret_stack is initialized to -1 and get increased - * in this function. So it can be less than -1 only if it was - * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in debugfs by user. - */ - if (current->curr_ret_stack < -1) - return -EBUSY; - calltime = trace_clock_local(); index = ++current->curr_ret_stack; - if (ftrace_graph_notrace_addr(func)) - current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; - *depth = current->curr_ret_stack; + *depth = index; return 0; } @@ -158,17 +137,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - /* - * A negative index here means that it's just returned from a - * notrace'd function. Recover index to get an original - * return address. See ftrace_push_return_trace(). - * - * TODO: Need to check whether the stack gets corrupted. - */ - if (index < 0) - index += FTRACE_NOTRACE_DEPTH; - - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + if (unlikely(index < 0)) { ftrace_graph_stop(); WARN_ON(1); /* Might as well panic, otherwise we have no where to go */ @@ -224,15 +193,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; - /* - * The curr_ret_stack can be less than -1 only if it was - * filtered out and it's about to return from the function. - * Recover the index and continue to trace normal functions. - */ - if (current->curr_ret_stack < -1) { - current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; - return ret; - } /* * The trace should run after decrementing the ret counter @@ -270,7 +230,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_current_check_discard(buffer, call, entry, event)) __buffer_unlock_commit(buffer, event); return 1; @@ -299,20 +259,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) /* trace it when it is-nested-in or is a function enabled. 
*/ if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || (trace->depth < 0) || + ftrace_graph_ignore_irqs()) || (max_depth && trace->depth >= max_depth)) return 0; - /* - * Do not trace a function if it's filtered by set_graph_notrace. - * Make the index of ret stack negative to indicate that it should - * ignore further functions. But it needs its own ret stack entry - * to recover the original index in order to continue tracing after - * returning from the function. - */ - if (ftrace_graph_notrace_addr(trace->func)) - return 1; - local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -385,7 +335,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_current_check_discard(buffer, call, entry, event)) __buffer_unlock_commit(buffer, event); } @@ -702,7 +652,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); + ret = print_graph_duration(DURATION_FILL_START, s, flags); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -714,7 +664,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); + ret = print_graph_duration(DURATION_FILL_END, s, flags); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -779,14 +729,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, return TRACE_TYPE_HANDLED; /* No real data, just filling the column with spaces */ - switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { - case FLAGS_FILL_FULL: + switch (duration) { + case DURATION_FILL_FULL: ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case FLAGS_FILL_START: + case DURATION_FILL_START: ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case FLAGS_FILL_END: + case DURATION_FILL_END: ret = trace_seq_puts(s, " |"); return ret ?
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -902,7 +852,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1222,7 +1172,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return TRACE_TYPE_PARTIAL_LINE; /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + ret = print_graph_duration(DURATION_FILL_FULL, s, flags); if (ret != TRACE_TYPE_HANDLED) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index dae9541..243f683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, entry->ip = (unsigned long)tp->rp.kp.addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) + if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } @@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) + if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b8..b3dcfb2 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed32284..34e7cba 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -618,23 +618,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; - - switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | - TRACE_FLAG_PREEMPT_RESCHED)) { - case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: - need_resched = 'N'; - break; - case TRACE_FLAG_NEED_RESCHED: - need_resched = 'n'; - break; - case TRACE_FLAG_PREEMPT_RESCHED: - need_resched = 'p'; - break; - default: - need_resched = '.'; - break; - } - + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 
'h' : diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9..4e98e3b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } @@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 7af6736..847f88a 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,15 +43,46 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; -static void __reset_stat_session(struct stat_session *session) +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessarily release one at each iteration + * but it will at least advance closer to the next one + * to be released. + */ +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { - struct stat_node *snode, *n; + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + ; + else if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; - rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { - if (session->ts->stat_release) - session->ts->stat_release(snode->stat); + snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); + + return parent; } +} + +static void __reset_stat_session(struct stat_session *session) +{ + struct rb_node *node = session->stat_root.rb_node; + + while (node) + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ea90eb5..559329d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -302,7 +302,6 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -315,13 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ - ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); - if (!ftrace_file) - return; - - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -343,7 +336,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) + if (!filter_current_check_discard(buffer, sys_data->enter_event, + entry, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -351,7 +345,6 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; - struct ftrace_event_file *ftrace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -363,13 +356,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - - /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ - ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); - if (!ftrace_file) - return; - - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -390,7 +377,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) + if (!filter_current_check_discard(buffer, sys_data->exit_event, + entry, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -409,7 +397,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - rcu_assign_pointer(tr->enter_syscall_files[num], file); + set_bit(num, tr->enabled_enter_syscalls); tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); @@ -427,7 +415,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - rcu_assign_pointer(tr->enter_syscall_files[num], NULL); + clear_bit(num, tr->enabled_enter_syscalls); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -447,7 +435,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - rcu_assign_pointer(tr->exit_syscall_files[num], file); + set_bit(num, tr->enabled_exit_syscalls); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -465,7 +453,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - rcu_assign_pointer(tr->exit_syscall_files[num], NULL); + clear_bit(num, tr->enabled_exit_syscalls); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b6dcc42..272261b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -128,7 +128,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -562,7 +561,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu, for (i = 0; i < tu->nr_args; 
i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); } diff --git a/kernel/up.c b/kernel/up.c index 509403e..630d72b 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,17 +22,6 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) -{ - unsigned long flags; - - local_irq_save(flags); - csd->func(csd->info); - local_irq_restore(flags); -} -EXPORT_SYMBOL(__smp_call_function_single); - int on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index a3a0dbf..5bbb919 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,10 +51,6 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, -#ifdef CONFIG_KEYS_KERBEROS_CACHE - .krb_cache_register_sem = - __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), -#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62..13fb113 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,9 +101,6 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); -#ifdef CONFIG_PERSISTENT_KEYRINGS - init_rwsem(&ns->persistent_keyring_register_sem); -#endif return 0; } @@ -133,9 +130,6 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; -#ifdef CONFIG_PERSISTENT_KEYRINGS - key_put(ns->persistent_keyring_register); -#endif proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); ns = parent; diff --git a/kernel/sched/wait.c b/kernel/wait.c index 7d50f79..d550920 100644 --- a/kernel/sched/wait.c +++ b/kernel/wait.c @@ -53,109 +53,6 @@ EXPORT_SYMBOL(remove_wait_queue); /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ - __wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = 1; /* XXX WF_SYNC */ - - if (unlikely(!q)) - return; - - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any * wake-function that tests for the wait-queue being active @@ -195,30 +92,6 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - - wait->private = current; - wait->func = autoremove_wake_function; - - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); - } - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); - - return 0; -} -EXPORT_SYMBOL(prepare_to_wait_event); - /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c66912be..987293d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,9 +305,6 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; -/* I: attributes used when instantiating ordered pools on demand */ 
-static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; - struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -521,21 +518,14 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/** - * worker_pool_assign_id - allocate ID and assing it to @pool - * @pool: the pool pointer of interest - * - * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned - * successfully, -errno on failure. - */ +/* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); - ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, - GFP_KERNEL); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; @@ -1330,7 +1320,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* if draining, only works from the same workqueue are allowed */ + /* if dying, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -1746,17 +1736,16 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; - /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ + set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; + /* * The caller is responsible for ensuring %POOL_DISASSOCIATED * remains stable across this function. 
See the comments above the @@ -4117,7 +4106,7 @@ out_unlock: static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; - int cpu, ret; + int cpu; if (!(wq->flags & WQ_UNBOUND)) { wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4137,13 +4126,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; - } else if (wq->flags & __WQ_ORDERED) { - ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); - /* there should only be single pwq for ordering guarantee */ - WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), - "ordering guarantee broken for workqueue %s\n", wq->name); - return ret; - } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5027,6 +5009,10 @@ static int __init init_workqueues(void) int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; + /* make sure we have enough bits for OFFQ pool ID */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < + WORK_CPU_END * NR_STD_WORKER_POOLS); + WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5065,23 +5051,13 @@ static int __init init_workqueues(void) } } - /* create default unbound and ordered wq attrs */ + /* create default unbound wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; - - /* - * An ordered wq should have only one pwq as ordering is - * guaranteed by max_active which is enforced by pwqs. - * Turn off NUMA so that dfl_pwq is used for all nodes. - */ - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); - attrs->nice = std_nice[i]; - attrs->no_numa = true; - ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0);
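
A note on the pattern that dominates the trace.c, trace_branch.c, trace_mmiotrace.c, trace_sched_switch.c and trace_uprobe.c hunks above: the rewind replaces per-file call_filter_check_discard() with v3.12's per-call filter_check_discard(), now an inline in trace.h. In both versions the flow is identical: reserve a ring-buffer event, fill it, then either publish it or throw the reservation away when the filter rejects the record. Below is a minimal userspace sketch of that reserve/discard-or-commit shape, assuming a single writer; every name in it is illustrative, none of it is kernel API.

#include <stdio.h>
#include <stdbool.h>

#define RING_SIZE 8

/* Toy single-writer ring: head marks reserved slots, commit marks
 * published ones. Readers only ever see entries below commit. */
struct ring {
        int buf[RING_SIZE];
        unsigned head;
        unsigned commit;
};

static int *ring_reserve(struct ring *r)
{
        if (r->head - r->commit >= RING_SIZE)
                return NULL;                     /* full */
        return &r->buf[r->head++ % RING_SIZE];
}

static void ring_commit(struct ring *r)  { r->commit = r->head; }
static void ring_discard(struct ring *r) { r->head = r->commit; }

static bool filter_match(int rec) { return (rec % 2) == 0; }

int main(void)
{
        struct ring r = { .head = 0, .commit = 0 };

        for (int rec = 0; rec < 5; rec++) {
                int *slot = ring_reserve(&r);
                if (!slot)
                        break;
                *slot = rec;
                if (filter_match(rec))
                        ring_commit(&r);
                else
                        ring_discard(&r);        /* never visible */
        }
        printf("published %u records\n", r.commit); /* prints 3 */
        return 0;
}

The kernel's ring_buffer_discard_commit() is more involved because it must cope with concurrent writers, but the reason both filter_check_discard() variants return 1 on discard is visible here: the caller has to skip its own commit.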
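
The trace_functions_graph.c hunks remove the set_graph_notrace machinery, whose comments (quoted above) describe biasing current->curr_ret_stack negative by FTRACE_NOTRACE_DEPTH on entry to a filtered function and adding the constant back on return, so records logged at a negative index are ignored while the true depth stays recoverable. A toy demonstration of that bias/recover trick; the constant's value is illustrative, not the kernel's:

#include <stdio.h>

#define NOTRACE_DEPTH (1 << 14)         /* illustrative, not the kernel value */

int main(void)
{
        int idx = 3;                    /* normal stack index */

        idx -= NOTRACE_DEPTH;           /* entered a filtered function */
        printf("filtered: idx=%d (negative, so records are skipped)\n", idx);

        idx += NOTRACE_DEPTH;           /* returned from it */
        printf("recovered: idx=%d\n", idx);
        return 0;
}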
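
release_next() in the restored trace_stat.c frees the stat tree iteratively: walk down to a node with no children, free it, clear the parent's link to it, and return the parent, so every call either descends or frees exactly one node, with no recursion and no auxiliary stack (the pre-rewind code had replaced this with rbtree_postorder_for_each_entry_safe(), as the removed lines show). Here is the same idea on a plain binary tree as compilable userspace C; the node layout is illustrative:

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *left, *right, *parent;
        int stat;
};

/* Free *node* if it is a leaf and return its parent; otherwise
 * return a child so the caller keeps descending. */
static struct node *release_next(struct node *node)
{
        struct node *parent = node->parent;

        if (node->left)
                return node->left;
        if (node->right)
                return node->right;

        if (parent) {
                if (parent->left == node)
                        parent->left = NULL;
                else
                        parent->right = NULL;
        }
        printf("releasing %d\n", node->stat);
        free(node);
        return parent;
}

static struct node *mknode(struct node *parent, int stat)
{
        struct node *n = calloc(1, sizeof(*n)); /* error handling omitted */
        n->parent = parent;
        n->stat = stat;
        return n;
}

int main(void)
{
        struct node *root = mknode(NULL, 1);
        root->left = mknode(root, 2);
        root->right = mknode(root, 3);
        root->left->left = mknode(root->left, 4);

        for (struct node *n = root; n; )
                n = release_next(n);    /* releases 4, 2, 3, 1 */
        return 0;
}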
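
In trace_syscalls.c the rewind drops the RCU-protected per-syscall ftrace_event_file pointers in favour of v3.12's DECLARE_BITMAP() pair, so the enter/exit hot paths reduce to a single test_bit() per syscall, with set_bit()/clear_bit() at registration time. A userspace analogue of that bitmap; the word type and the NR_SYSCALLS value here are assumptions for the sketch:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define NR_SYSCALLS 313                 /* illustrative */
#define BITS_PER_WORD (sizeof(uint64_t) * CHAR_BIT)
#define BITMAP_WORDS ((NR_SYSCALLS + BITS_PER_WORD - 1) / BITS_PER_WORD)

static uint64_t enabled_enter[BITMAP_WORDS];

static void set_bit64(int nr, uint64_t *map)
{
        map[nr / BITS_PER_WORD] |= (uint64_t)1 << (nr % BITS_PER_WORD);
}

static int test_bit64(int nr, const uint64_t *map)
{
        return (map[nr / BITS_PER_WORD] >> (nr % BITS_PER_WORD)) & 1;
}

int main(void)
{
        set_bit64(59, enabled_enter);   /* e.g. execve on x86-64 */
        printf("59 enabled: %d\n", test_bit64(59, enabled_enter));
        printf("60 enabled: %d\n", test_bit64(60, enabled_enter));
        return 0;
}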
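
The __wake_up_common() deleted from kernel/wait.c above (the pre-rewind tree had moved it to kernel/sched/wait.c) encodes the wakeup contract its comment describes: every non-exclusive waiter is woken, the scan stops once nr_exclusive exclusive waiters have been woken, and nr_exclusive == 0 means wake everything. A compilable sketch of just that accounting rule; struct waiter and wake() are stand-ins, not kernel API:

#include <stdio.h>
#include <stdbool.h>

struct waiter {
        const char *name;
        bool exclusive;
};

static bool wake(struct waiter *w)
{
        printf("waking %s%s\n", w->name, w->exclusive ? " (exclusive)" : "");
        return true;    /* stand-in for curr->func() reporting success */
}

/* Mirrors the kernel loop: the decrement only counts waiters that
 * were actually woken, and 0 means "no exclusive limit". */
static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
        for (int i = 0; i < n; i++)
                if (wake(&q[i]) && q[i].exclusive && !--nr_exclusive)
                        break;
}

int main(void)
{
        /* The kernel adds exclusive waiters at the tail, so the
         * non-exclusive ones come first; the order is mixed here
         * only to show where the scan stops. */
        struct waiter q[] = {
                { "a", false }, { "b", true }, { "c", true }, { "d", false },
        };

        wake_up_common(q, 4, 1);        /* wakes a and b, then stops */
        return 0;
}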
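
Finally, the restored init_workqueues() protects the pool-ID-in-work-data encoding with a compile-time BUILD_BUG_ON(), where the pre-rewind code instead capped idr_alloc() at WORK_OFFQ_POOL_NONE so over-wide IDs could never be handed out. The same compile-time check written as portable C11; the WORK_* values below are placeholders, not the kernel's:

#include <limits.h>

#define WORK_OFFQ_POOL_SHIFT    5       /* placeholder value */
#define WORK_CPU_END            256     /* placeholder value */
#define NR_STD_WORKER_POOLS     2

/* Fails the build if pool IDs cannot fit in the bits left above
 * WORK_OFFQ_POOL_SHIFT in an unsigned long. */
_Static_assert((1UL << (sizeof(long) * CHAR_BIT - WORK_OFFQ_POOL_SHIFT)) >=
               (unsigned long)WORK_CPU_END * NR_STD_WORKER_POOLS,
               "not enough bits for OFFQ pool ID");

int main(void) { return 0; }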