summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2016-08-18 09:20:19 (GMT)
committerIngo Molnar <mingo@kernel.org>2016-08-18 09:20:19 (GMT)
commit5a96215739fbda2ccc0f2371295e93e4cb6a088a (patch)
tree65459d484c23c0574984a1ac237254ce69c1f623 /kernel
parent1fc770d5899c995db8e22d35eb918a2cb79559d9 (diff)
parent03cbc732639ddcad15218c4b2046d255851ff1e3 (diff)
downloadlinux-5a96215739fbda2ccc0f2371295e93e4cb6a088a.tar.xz
Merge branch 'sched/urgent' into sched/core, to pick up dependencies
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/events/core.c77
-rw-r--r--kernel/futex.c23
-rw-r--r--kernel/irq/msi.c11
-rw-r--r--kernel/locking/qspinlock_paravirt.h2
-rw-r--r--kernel/locking/qspinlock_stat.h1
-rw-r--r--kernel/power/hibernate.c4
-rw-r--r--kernel/sched/cputime.c41
-rw-r--r--kernel/time/timer.c5
8 files changed, 127 insertions, 37 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a19550d..1903b8f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
}
}
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+ /*
+ * Because cgroup events are always per-cpu events,
+ * this will always be called from the right CPU.
+ */
+ cpuctx = __get_cpu_context(ctx);
+ cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
#else /* !CONFIG_CGROUP_PERF */
static inline bool
@@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool add)
+{
+}
+
#endif
/*
@@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
- if (is_cgroup_event(event))
- ctx->nr_cgroups++;
+ list_update_cgroup_event(event, ctx, true);
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
@@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- if (is_cgroup_event(event)) {
- ctx->nr_cgroups--;
- /*
- * Because cgroup events are always per-cpu events, this will
- * always be called from the right CPU.
- */
- cpuctx = __get_cpu_context(ctx);
- /*
- * If there are no more cgroup events then clear cgrp to avoid
- * stale pointer in update_cgrp_time_from_cpuctx().
- */
- if (!ctx->nr_cgroups)
- cpuctx->cgrp = NULL;
- }
+ list_update_cgroup_event(event, ctx, false);
ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1716,8 +1734,8 @@ static inline int pmu_filter_match(struct perf_event *event)
static inline int
event_filter_match(struct perf_event *event)
{
- return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event) && pmu_filter_match(event);
+ return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+ perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1737,8 +1755,8 @@ event_sched_out(struct perf_event *event,
* maintained, otherwise bogus information is return
* via read() for time_enabled, time_running:
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
+ if (event->state == PERF_EVENT_STATE_INACTIVE &&
+ !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
@@ -2236,10 +2254,15 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
- event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
+ /*
+ * Ensures that if we can observe event->ctx, both the event and ctx
+ * will be 'complete'. See perf_iterate_sb_cpu().
+ */
+ smp_store_release(&event->ctx, ctx);
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -5969,6 +5992,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
struct perf_event *event;
list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ /*
+ * Skip events that are not fully formed yet; ensure that
+ * if we observe event->ctx, both event and ctx will be
+ * complete enough. See perf_install_in_context().
+ */
+ if (!smp_load_acquire(&event->ctx))
+ continue;
+
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f7..46cb3a3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
-#define FLAGS_SHARED 0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
if (!key->both.ptr)
return;
+ /*
+ * On MMU less systems futexes are always "private" as there is no per
+ * process address space. We need the smp wmb nevertheless - yes,
+ * arch/blackfin has MMU less SMP ...
+ */
+ if (!IS_ENABLED(CONFIG_MMU)) {
+ smp_mb(); /* explicit smp_mb(); (B) */
+ return;
+ }
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
return;
}
+ if (!IS_ENABLED(CONFIG_MMU))
+ return;
+
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
iput(key->shared.inode);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 5499935..19e9dfb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e6..8a99abf 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
}
}
- WRITE_ONCE(pn->state, vcpu_halted);
+ WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e0253..b9d0315 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
*/
if ((counter == qstat_pv_latency_kick) ||
(counter == qstat_pv_latency_wake)) {
- stat = 0;
if (kicks)
stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a881c6a..33c79b6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
if (!in_suspend)
events_check_enabled = false;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f65..a846cf8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
*/
- other = account_other_time(cputime);
+ other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
cputime = cputime_one_jiffy;
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
@@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
@@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
unsigned long now = READ_ONCE(jiffies);
cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
delta = jiffies_to_cputime(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a..32bf6f7 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
+ bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
spin_lock(&base->lock);
nextevt = __next_timer_interrupt(base);
+ is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
base->next_expiry = nextevt;
/*
* We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ if (!is_max_delta)
+ expires = basem + (nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle:
*/