summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/sysctl/kernel.txt8
-rw-r--r--Makefile3
-rw-r--r--arch/arm/kvm/arm.c8
-rw-r--r--arch/arm/kvm/psci.c4
-rw-r--r--arch/mips/kvm/mips.c8
-rw-r--r--arch/powerpc/include/asm/kvm_host.h4
-rw-r--r--arch/powerpc/kvm/book3s_hv.c23
-rw-r--r--arch/s390/include/asm/kvm_host.h2
-rw-r--r--arch/s390/kvm/interrupt.c4
-rw-r--r--arch/x86/kvm/lapic.c6
-rw-r--r--include/linux/kvm_host.h5
-rw-r--r--include/linux/latencytop.h3
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/sched/sysctl.h4
-rw-r--r--include/linux/swait.h172
-rw-r--r--include/linux/wait.h2
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/rcu/tree.c24
-rw-r--r--kernel/rcu/tree.h12
-rw-r--r--kernel/rcu/tree_plugin.h32
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/core.c80
-rw-r--r--kernel/sched/debug.c102
-rw-r--r--kernel/sched/fair.c200
-rw-r--r--kernel/sched/sched.h1
-rw-r--r--kernel/sched/stats.h8
-rw-r--r--kernel/sched/swait.c123
-rw-r--r--kernel/sysctl.c13
-rw-r--r--virt/kvm/async_pf.c4
-rw-r--r--virt/kvm/kvm_main.c17
32 files changed, 693 insertions, 205 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9a53c92..76d68c5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3528,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
sched_debug [KNL] Enables verbose scheduler debug messages.
+ schedstats= [KNL,X86] Enable or disable scheduled statistics.
+ Allowed values are enable and disable. This feature
+ incurs a small amount of overhead in the scheduler
+ but is useful for debugging and performance tuning.
+
skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
xtime_lock contention on larger systems, and/or RCU lock
contention on all systems with CONFIG_MAXSMP set.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a93b414..87119dc 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued.
==============================================================
+sched_schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
sg-big-buff:
This file shows the size of the generic SCSI (sg) buffer.
diff --git a/Makefile b/Makefile
index af6e5f8..db2bc43 100644
--- a/Makefile
+++ b/Makefile
@@ -773,6 +773,9 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=strict-prototypes)
# Prohibit date/time macros, which would make the build non-deterministic
KBUILD_CFLAGS += $(call cc-option,-Werror=date-time)
+# enforce correct pointer usage
+KBUILD_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types)
+
# use the deterministic mode of AR if available
KBUILD_ARFLAGS := $(call ar-option,D)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dda1959..08e49c4 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
- wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+ struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
vcpu->arch.pause = false;
- wake_up_interruptible(wq);
+ swake_up(wq);
}
}
static void vcpu_sleep(struct kvm_vcpu *vcpu)
{
- wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+ struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
- wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+ swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
(!vcpu->arch.pause)));
}
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index a9b3b90..c2b1315 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
struct kvm *kvm = source_vcpu->kvm;
struct kvm_vcpu *vcpu = NULL;
- wait_queue_head_t *wq;
+ struct swait_queue_head *wq;
unsigned long cpu_id;
unsigned long context_id;
phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
smp_mb(); /* Make sure the above is visible */
wq = kvm_arch_vcpu_wq(vcpu);
- wake_up_interruptible(wq);
+ swake_up(wq);
return PSCI_RET_SUCCESS;
}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 8bc3977..341f6a1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0;
- if (waitqueue_active(&dvcpu->wq))
- wake_up_interruptible(&dvcpu->wq);
+ if (swait_active(&dvcpu->wq))
+ swake_up(&dvcpu->wq);
return 0;
}
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
kvm_mips_callbacks->queue_timer_int(vcpu);
vcpu->arch.wait = 0;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ if (swait_active(&vcpu->wq))
+ swake_up(&vcpu->wq);
}
/* low level hrtimer wake routine */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9d08d8c..c98afa5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvmppc_vcore {
struct list_head runnable_threads;
struct list_head preempt_list;
spinlock_t lock;
- wait_queue_head_t wq;
+ struct swait_queue_head wq;
spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
u64 stolen_tb;
u64 preempt_tb;
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
u8 prodded;
u32 last_inst;
- wait_queue_head_t *wqp;
+ struct swait_queue_head *wqp;
struct kvmppc_vcore *vcore;
int ret;
int trap;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index baeddb0..f1187bb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
int cpu;
- wait_queue_head_t *wqp;
+ struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
- if (waitqueue_active(wqp)) {
- wake_up_interruptible(wqp);
+ if (swait_active(wqp)) {
+ swake_up(wqp);
++vcpu->stat.halt_wakeup;
}
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
tvcpu->arch.prodded = 1;
smp_mb();
if (vcpu->arch.ceded) {
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
+ if (swait_active(&vcpu->wq)) {
+ swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
}
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
spin_lock_init(&vcore->stoltb_lock);
- init_waitqueue_head(&vcore->wq);
+ init_swait_queue_head(&vcore->wq);
vcore->preempt_tb = TB_NIL;
vcore->lpcr = kvm->arch.lpcr;
vcore->first_vcpuid = core * threads_per_subcore;
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
struct kvm_vcpu *vcpu;
int do_sleep = 1;
+ DECLARE_SWAITQUEUE(wait);
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
/*
* Check one last time for pending exceptions and ceded state after
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
}
if (!do_sleep) {
- finish_wait(&vc->wq, &wait);
+ finish_swait(&vc->wq, &wait);
return;
}
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
trace_kvmppc_vcore_blocked(vc, 0);
spin_unlock(&vc->lock);
schedule();
- finish_wait(&vc->wq, &wait);
+ finish_swait(&vc->wq, &wait);
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_vcore_blocked(vc, 1);
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) {
- wake_up(&vc->wq);
+ swake_up(&vc->wq);
}
}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8959ebb..b0c8ad0 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
struct kvm_s390_local_interrupt {
spinlock_t lock;
struct kvm_s390_float_interrupt *float_int;
- wait_queue_head_t *wq;
+ struct swait_queue_head *wq;
atomic_t *cpuflags;
DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
struct kvm_s390_irq_payload irq;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72..9ffc732 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -966,13 +966,13 @@ no_timer:
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
{
- if (waitqueue_active(&vcpu->wq)) {
+ if (swait_active(&vcpu->wq)) {
/*
* The vcpu gave up the cpu voluntarily, mark it as a good
* yield-candidate.
*/
vcpu->preempted = true;
- wake_up_interruptible(&vcpu->wq);
+ swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591fa..3a045f3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
+ struct swait_queue_head *q = &vcpu->wq;
struct kvm_timer *ktimer = &apic->lapic_timer;
if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu);
- if (waitqueue_active(q))
- wake_up_interruptible(q);
+ if (swait_active(q))
+ swake_up(q);
if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 861f690..5276fe0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -25,6 +25,7 @@
#include <linux/irqflags.h>
#include <linux/context_tracking.h>
#include <linux/irqbypass.h>
+#include <linux/swait.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
unsigned char fpu_counter;
- wait_queue_head_t wq;
+ struct swait_queue_head wq;
struct pid *pid;
int sigset_active;
sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
}
#endif
-static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
{
#ifdef __KVM_HAVE_ARCH_WQP
return vcpu->arch.wqp;
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f..59ccab2 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
void clear_all_latency_tracing(struct task_struct *p);
+extern int sysctl_latencytop(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+
#else
static inline void
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..a292c4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,6 +920,10 @@ static inline int sched_info_on(void)
#endif
}
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731..4f080ab 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/swait.h b/include/linux/swait.h
new file mode 100644
index 0000000..c1f9c62
--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ * sleeper state.
+ *
+ * - the exclusive mode; because this requires preserving the list order
+ * and this is hard.
+ *
+ * - custom wake functions; because you cannot give any guarantees about
+ * random code.
+ *
+ * As a side effect of this; the data structures are slimmer.
+ *
+ * One would recommend using this wait queue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+ raw_spinlock_t lock;
+ struct list_head task_list;
+};
+
+struct swait_queue {
+ struct task_struct *task;
+ struct list_head task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) { \
+ .task = current, \
+ .task_list = LIST_HEAD_INIT((name).task_list), \
+}
+
+#define DECLARE_SWAITQUEUE(name) \
+ struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+ .task_list = LIST_HEAD_INIT((name).task_list), \
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name) \
+ struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key);
+
+#define init_swait_queue_head(q) \
+ do { \
+ static struct lock_class_key __key; \
+ __init_swait_queue_head((q), #q, &__key); \
+ } while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
+ ({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
+ struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
+ DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+ return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd) \
+({ \
+ struct swait_queue __wait; \
+ long __ret = ret; \
+ \
+ INIT_LIST_HEAD(&__wait.task_list); \
+ for (;;) { \
+ long __int = prepare_to_swait_event(&wq, &__wait, state);\
+ \
+ if (condition) \
+ break; \
+ \
+ if (___wait_is_interruptible(state) && __int) { \
+ __ret = __int; \
+ break; \
+ } \
+ \
+ cmd; \
+ } \
+ finish_swait(&wq, &__wait); \
+ __ret; \
+})
+
+#define __swait_event(wq, condition) \
+ (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
+ schedule())
+
+#define swait_event(wq, condition) \
+do { \
+ if (condition) \
+ break; \
+ __swait_event(wq, condition); \
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout) \
+ ___swait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_UNINTERRUPTIBLE, timeout, \
+ __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __swait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+#define __swait_event_interruptible(wq, condition) \
+ ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
+ schedule())
+
+#define swait_event_interruptible(wq, condition) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __swait_event_interruptible(wq, condition); \
+ __ret; \
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout) \
+ ___swait_event(wq, ___wait_cond_timeout(condition), \
+ TASK_INTERRUPTIBLE, timeout, \
+ __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout) \
+({ \
+ long __ret = timeout; \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __swait_event_interruptible_timeout(wq, \
+ condition, timeout); \
+ __ret; \
+})
+
+#endif /* _LINUX_SWAIT_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ae71a76..27d7a0a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -338,7 +338,7 @@ do { \
schedule(); try_to_freeze())
/**
- * wait_event - sleep (or freeze) until a condition gets true
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a028127..b5c30d9 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
* of times)
*/
-#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
+#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
device_initcall(init_lstats_procfs);
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1..5136969 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
+ force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e41dd41..9fd5b62 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
- wake_up(&rsp->gp_wq);
+ swake_up(&rsp->gp_wq);
}
/*
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct swait_queue_head *sq;
WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
+ sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq(&rnp->lock);
+ rcu_nocb_gp_cleanup(sq);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- wait_event_interruptible(rsp->gp_wq,
+ swait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_interruptible_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- wake_up(&rsp->expedited_wq);
+ swake_up(&rsp->expedited_wq);
}
break;
}
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
- ret = wait_event_interruptible_timeout(
+ ret = swait_event_timeout(
rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall);
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
return;
if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */
- wait_event(rsp->expedited_wq,
+ swait_event(rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root));
return;
}
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
}
}
- init_waitqueue_head(&rsp->gp_wq);
- init_waitqueue_head(&rsp->expedited_wq);
+ init_swait_queue_head(&rsp->gp_wq);
+ init_swait_queue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 83360b4..bbd235d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/swait.h>
#include <linux/stop_machine.h>
/*
@@ -243,7 +244,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
- wait_queue_head_t nocb_gp_wq[2];
+ struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
@@ -399,7 +400,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
- wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -478,7 +479,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
- wait_queue_head_t gp_wq; /* Where GP task waits. */
+ struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
@@ -506,7 +507,7 @@ struct rcu_state {
unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
- wait_queue_head_t expedited_wq; /* Wait for check-ins. */
+ struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9467a8b..080bd20 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
- wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+ swake_up_all(sq);
}
/*
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- init_waitqueue_head(&rnp->nocb_gp_wq[0]);
- init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- wake_up(&rdp_leader->nocb_wq);
+ swake_up(&rdp_leader->nocb_wq);
}
}
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
- wait_event_interruptible(
+ swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
@@ -2097,7 +2102,7 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
- wait_event_interruptible(my_rdp->nocb_wq,
+ swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
@@ -2172,7 +2177,7 @@ wait_again:
* List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add().
*/
- wake_up(&rdp->nocb_wq);
+ swake_up(&rdp->nocb_wq);
}
}
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
- wait_event_interruptible(rdp->nocb_wq,
+ swait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
- init_waitqueue_head(&rdp->nocb_wq);
+ init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
return false;
}
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6768797..7d4cba2 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab814bf..291552b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu);
stat:
- ttwu_stat(p, cpu, wake_flags);
+ if (schedstat_enabled())
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
+ if (schedstat_enabled())
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2281,6 +2284,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ set_schedstats(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_schedstats(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif
+#endif
+
/*
* fork()/clone()-time setup:
*/
@@ -3280,7 +3346,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
- cpu = cpu_of(rq);
} else {
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
@@ -6173,11 +6238,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
+ int ret;
+
alloc_bootmem_cpumask_var(&cpu_isolated_map);
- cpulist_parse(str, cpu_isolated_map);
+ ret = cpulist_parse(str, cpu_isolated_map);
+ if (ret) {
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return 0;
+ }
return 1;
}
-
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6415117..7cfa87b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ if (schedstat_enabled()) {
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
+ }
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
@@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ if (schedstat_enabled()) {
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.statistics.wait_sum),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ }
#else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
@@ -313,17 +317,18 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P(yld_count);
-
- P(sched_count);
- P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
P64(max_idle_balance_cost);
#endif
- P(ttwu_count);
- P(ttwu_local);
+ if (schedstat_enabled()) {
+ P(yld_count);
+ P(sched_count);
+ P(sched_goidle);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
#undef P
#undef P64
@@ -569,38 +574,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
P(se.nr_migrations);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
- {
+ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
+ PN(se.statistics.sum_sleep_runtime);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cce3303..5641042 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ u64 delta;
+
+ delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
if (entity_is_task(se)) {
p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->statistics.wait_sum += delta;
se->statistics.wait_start = 0;
}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
+
+ if (flags & DEQUEUE_SLEEP) {
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+ }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
/*
* We are picking a new current task - update its stats:
*/
@@ -907,10 +932,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
+ int active_nodes;
struct rcu_head rcu;
- nodemask_t active_nodes;
unsigned long total_faults;
+ unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true;
/*
- * Do not migrate if the destination is not a node that
- * is actively used by this numa group.
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
*/
- if (!node_isset(dst_nid, ng->active_nodes))
- return false;
-
- /*
- * Source is a node that is not actively used by this
- * numa group, while the destination is. Migrate.
- */
- if (!node_isset(src_nid, ng->active_nodes))
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
return true;
/*
- * Both source and destination are nodes in active
- * use by this numa group. Maximize memory bandwidth
- * by migrating from more heavily used groups, to less
- * heavily used ones, spreading the load around.
- * Use a 1/4 hysteresis to avoid spurious page movement.
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
*/
- return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL,
.best_imp = 0,
- .best_cpu = -1
+ .best_cpu = -1,
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group &&
- nodes_weight(p->numa_group->active_nodes) > 1)) {
+ if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
- if (node_isset(nid, p->numa_group->active_nodes))
+ if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
}
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
- int nid;
+ int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
- if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
- node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
- node_clear(nid, numa_group->active_nodes);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
}
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
}
/*
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
- update_numa_active_node_mask(p->numa_group);
+ numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
}
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return;
atomic_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
- node_set(task_node(current), grp->active_nodes);
-
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- if (!priv && !local && p->numa_group &&
- node_isset(cpu_node, p->numa_group->active_nodes) &&
- node_isset(mem_node, p->numa_group->active_nodes))
+ ng = p->numa_group;
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled()) {
+ pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ "stat_blocked and stat_runtime require the "
+ "kernel parameter schedstats=enabled or "
+ "kernel.sched_schedstats=1\n");
+ }
+#endif
+}
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
+ if (schedstat_enabled())
+ enqueue_sleeper(cfs_rq, se);
}
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
+ check_schedstat_required();
+ if (schedstat_enabled()) {
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ }
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
-#endif
- }
+ if (schedstat_enabled())
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ if (schedstat_enabled())
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
+ if (schedstat_enabled()) {
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq)
+ update_stats_wait_start(cfs_rq, prev);
+ }
+
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30ea2d8..91f0a77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc76..70b3b6a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val) do { var = (val); } while (0)
+# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
+# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 0000000..82f0dff
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key)
+{
+ raw_spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+ unsigned long flags;
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ swake_up_locked(q);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ LIST_HEAD(tmp);
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irq(&q->lock);
+ list_splice_init(&q->task_list, &tmp);
+ while (!list_empty(&tmp)) {
+ curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+ wake_up_state(curr->task, TASK_NORMAL);
+ list_del_init(&curr->task_list);
+
+ if (list_empty(&tmp))
+ break;
+
+ raw_spin_unlock_irq(&q->lock);
+ raw_spin_lock_irq(&q->lock);
+ }
+ raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ wait->task = current;
+ if (list_empty(&wait->task_list))
+ list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ prepare_to_swait(q, wait, state);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ __set_current_state(TASK_RUNNING);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait->task_list)) {
+ raw_spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_swait);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97715fd..f5102fab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_latencytop,
},
#endif
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index db2dd33..65da997 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
* This memory barrier pairs with prepare_to_wait's set_current_state()
*/
smp_mb();
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ if (swait_active(&vcpu->wq))
+ swake_up(&vcpu->wq);
mmput(mm);
kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a11cfd2..f8417d0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
vcpu->pid = NULL;
- vcpu->halt_poll_ns = 0;
- init_waitqueue_head(&vcpu->wq);
+ init_swait_queue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
@@ -1990,7 +1989,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
ktime_t start, cur;
- DEFINE_WAIT(wait);
+ DECLARE_SWAITQUEUE(wait);
bool waited = false;
u64 block_ns;
@@ -2015,7 +2014,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_blocking(vcpu);
for (;;) {
- prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
if (kvm_vcpu_check_block(vcpu) < 0)
break;
@@ -2024,7 +2023,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
schedule();
}
- finish_wait(&vcpu->wq, &wait);
+ finish_swait(&vcpu->wq, &wait);
cur = ktime_get();
kvm_arch_vcpu_unblocking(vcpu);
@@ -2056,11 +2055,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
int me;
int cpu = vcpu->cpu;
- wait_queue_head_t *wqp;
+ struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
- if (waitqueue_active(wqp)) {
- wake_up_interruptible(wqp);
+ if (swait_active(wqp)) {
+ swake_up(wqp);
++vcpu->stat.halt_wakeup;
}
@@ -2161,7 +2160,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (vcpu == me)
continue;
- if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+ if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;