Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c                 |  12
-rw-r--r--  kernel/livepatch/core.c              |  30
-rw-r--r--  kernel/locking/lockdep.c             |  81
-rw-r--r--  kernel/module.c                      |  29
-rw-r--r--  kernel/power/snapshot.c              |  21
-rw-r--r--  kernel/sched/core.c                  |   2
-rw-r--r--  kernel/sched/fair.c                  |  12
-rw-r--r--  kernel/sysctl.c                      |   8
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c |  11
9 files changed, 143 insertions(+), 63 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f04daab..2fabc06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3591,7 +3591,7 @@ static void put_event(struct perf_event *event)
ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
WARN_ON_ONCE(ctx->parent_ctx);
perf_remove_from_context(event, true);
- mutex_unlock(&ctx->mutex);
+ perf_event_ctx_unlock(event, ctx);
_free_event(event);
}
@@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
{
struct perf_event *event = container_of(entry,
struct perf_event, pending);
+ int rctx;
+
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
if (event->pending_disable) {
event->pending_disable = 0;
@@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
}
/*
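
The perf hunks above bracket perf_pending_event() with a software-event recursion context, so a software event raised while the pending disable/wakeup work runs cannot re-enter this path; a negative rctx only means recursion is already disabled further up the stack, in which case nothing is released on the way out. A minimal userspace sketch of that get/put guard shape (every name below is invented for the illustration; only the shape matches perf_swevent_{get,put}_recursion_context()):

/* Illustrative userspace analogue of a get/put recursion guard. */
#include <stdio.h>

static int in_handler;          /* one level; perf keeps one per context type */

static int recursion_get(void)
{
        if (in_handler)
                return -1;      /* already inside: caller must not recurse */
        in_handler = 1;
        return 0;               /* the level we claimed */
}

static void recursion_put(int rctx)
{
        if (rctx >= 0)
                in_handler = 0;
}

static void pending_work(void)
{
        int rctx = recursion_get();

        /* A negative rctx is fine: the guard is already held further up
         * the stack, so we simply do not release it here. */
        printf("handling pending event (rctx=%d)\n", rctx);

        if (rctx >= 0)
                recursion_put(rctx);
}

int main(void)
{
        pending_work();
        return 0;
}
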
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 01ca088..3f9f1d6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj)
/* sets obj->mod if object is not vmlinux and module is found */
static void klp_find_object_module(struct klp_object *obj)
{
+ struct module *mod;
+
if (!klp_is_module(obj))
return;
mutex_lock(&module_mutex);
/*
- * We don't need to take a reference on the module here because we have
- * the klp_mutex, which is also taken by the module notifier. This
- * prevents any module from unloading until we release the klp_mutex.
+ * We do not want to block removal of patched modules and therefore
+ * we do not take a reference here. The patches are removed by
+ * a going module handler instead.
+ */
+ mod = find_module(obj->name);
+ /*
+ * Do not mess work of the module coming and going notifiers.
+ * Note that the patch might still be needed before the going handler
+ * is called. Module functions can be called even in the GOING state
+ * until mod->exit() finishes. This is especially important for
+ * patches that modify semantic of the functions.
*/
- obj->mod = find_module(obj->name);
+ if (mod && mod->klp_alive)
+ obj->mod = mod;
+
mutex_unlock(&module_mutex);
}
@@ -767,6 +779,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
return -EINVAL;
obj->state = KLP_DISABLED;
+ obj->mod = NULL;
klp_find_object_module(obj);
@@ -961,6 +974,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
mutex_lock(&klp_mutex);
+ /*
+ * Each module has to know that the notifier has been called.
+ * We never know what module will get patched by a new patch.
+ */
+ if (action == MODULE_STATE_COMING)
+ mod->klp_alive = true;
+ else /* MODULE_STATE_GOING */
+ mod->klp_alive = false;
+
list_for_each_entry(patch, &klp_patches, list) {
for (obj = patch->objs; obj->funcs; obj++) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
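
The livepatch hunks stop pinning patched modules with a reference: each module instead carries a klp_alive flag that the MODULE_STATE_COMING notifier sets and the MODULE_STATE_GOING notifier clears, and klp_find_object_module() only attaches obj->mod when that flag is set, so a patch never latches onto a module the notifiers have not seen yet or have already let go. A rough userspace analogue of that lifecycle flag (the plugin registry and all names in it are invented for the sketch):

/* Userspace sketch of the klp_alive idea: lookup only returns objects whose
 * load notifier has run and whose unload notifier has not. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct plugin {
        const char *name;
        bool alive;             /* plays the role of mod->klp_alive */
};

static struct plugin plugins[] = {
        { "ext4",  false },
        { "btrfs", false },
};

static void notify(struct plugin *p, bool coming)
{
        p->alive = coming;      /* COMING sets it, GOING clears it */
}

static struct plugin *find_alive(const char *name)
{
        for (size_t i = 0; i < sizeof(plugins) / sizeof(plugins[0]); i++) {
                struct plugin *p = &plugins[i];

                if (!strcmp(p->name, name) && p->alive)
                        return p;
        }
        return NULL;            /* not yet notified, or already going: skip */
}

int main(void)
{
        printf("before COMING: %p\n", (void *)find_alive("ext4"));
        notify(&plugins[0], true);
        printf("after COMING:  %p\n", (void *)find_alive("ext4"));
        notify(&plugins[0], false);
        printf("after GOING:   %p\n", (void *)find_alive("ext4"));
        return 0;
}
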
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d44..ba77ab5 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
hash_head = classhashentry(key);
/*
- * We can walk the hash lockfree, because the hash only
- * grows, and we are careful when adding entries to the end:
+ * We do an RCU walk of the hash, see lockdep_free_key_range().
*/
- list_for_each_entry(class, hash_head, hash_entry) {
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct list_head *hash_head;
struct lock_class *class;
- unsigned long flags;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
key = lock->key->subkeys + subclass;
hash_head = classhashentry(key);
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
/*
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry(class, hash_head, hash_entry)
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
+ }
+
/*
* Allocate a new key from the static array, and add it to
* the hash:
*/
if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
if (!debug_locks_off_graph_unlock()) {
- raw_local_irq_restore(flags);
return NULL;
}
- raw_local_irq_restore(flags);
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
- raw_local_irq_restore(flags);
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("\n");
dump_stack();
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
}
out_unlock_set:
graph_unlock();
- raw_local_irq_restore(flags);
out_set_class_cache:
if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
entry->distance = distance;
entry->trace = *trace;
/*
- * Since we never remove from the dependency list, the list can
- * be walked lockless by other CPUs, it's only allocation
- * that must be protected by the spinlock. But this also means
- * we must make new entries visible only once writes to the
- * entry become visible - hence the RCU op:
+ * Both allocation and removal are done under the graph lock; but
+ * iteration is under RCU-sched; see look_up_lock_class() and
+ * lockdep_free_key_range().
*/
list_add_tail_rcu(&entry->entry, head);
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
else
head = &lock->class->locks_before;
- list_for_each_entry(entry, head, entry) {
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ list_for_each_entry_rcu(entry, head, entry) {
if (!lock_accessed(entry)) {
unsigned int cq_depth;
mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ list_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (unlikely(!debug_locks))
return;
- if (subclass)
+ if (subclass) {
+ unsigned long flags;
+
+ if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
register_lock_class(lock, subclass, 1);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+ }
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
void lockdep_free_key_range(void *start, unsigned long size)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
if (locked)
graph_unlock();
raw_local_irq_restore(flags);
+
+ /*
+ * Wait for any possible iterators from look_up_lock_class() to pass
+ * before continuing to free the memory they refer to.
+ *
+ * sync_sched() is sufficient because the read-side is IRQ disable.
+ */
+ synchronize_sched();
+
+ /*
+ * XXX at this point we could return the resources to the pool;
+ * instead we leak them. We would need to change to bitmap allocators
+ * instead of the linear allocators we have now.
+ */
}
void lockdep_reset_lock(struct lockdep_map *lock)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
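
The lockdep hunks convert the class-hash, dependency-list and chain-hash walks to list_for_each_entry_rcu() with interrupts disabled, which makes every walker an RCU-sched read-side section, and lockdep_free_key_range() now ends with synchronize_sched() so those walkers drain before the module memory backing the zapped classes can be reused. A condensed kernel-style sketch of that publish/walk/unpublish/wait pairing (not buildable on its own; the graph lock and zap_class() details are elided):

/* Kernel-style sketch only: the list_*_rcu / synchronize_sched() pairing
 * used by the lockdep hunks above. */

/* Writer side, called with the graph lock held. */
static void publish_class(struct lock_class *class, struct list_head *hash_head)
{
        list_add_tail_rcu(&class->hash_entry, hash_head);
}

static void unpublish_class(struct lock_class *class)
{
        list_del_rcu(&class->hash_entry);
}

/* Reader side: IRQs disabled acts as the RCU-sched read-side section. */
static struct lock_class *find_class(struct list_head *hash_head,
                                     struct lockdep_subclass_key *key)
{
        struct lock_class *class;

        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                return NULL;

        list_for_each_entry_rcu(class, hash_head, hash_entry)
                if (class->key == key)
                        return class;
        return NULL;
}

/* After unpublishing, and with IRQs enabled again, wait for every such
 * IRQs-off walker to finish before the memory may be reused. */
static void wait_for_walkers(void)
{
        synchronize_sched();
}
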
diff --git a/kernel/module.c b/kernel/module.c
index cc93cf6..ec53f59 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,7 +56,6 @@
#include <linux/async.h>
#include <linux/percpu.h>
#include <linux/kmemleak.h>
-#include <linux/kasan.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
@@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { }
void __weak module_memfree(void *module_region)
{
vfree(module_region);
- kasan_module_free(module_region);
}
void __weak module_arch_cleanup(struct module *mod)
@@ -1867,7 +1865,7 @@ static void free_module(struct module *mod)
kfree(mod->args);
percpu_modfree(mod);
- /* Free lock-classes: */
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
@@ -2481,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
return 0;
}
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -2500,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
vfree(info->hdr);
return -EFAULT;
}
@@ -3351,9 +3366,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
- /* Free lock-classes: */
- lockdep_free_key_range(mod->module_core, mod->core_size);
-
/* we can't deallocate the module until we clear memory protection */
unset_module_init_ro_nx(mod);
unset_module_core_ro_nx(mod);
@@ -3377,6 +3389,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
module_deallocate(mod, info);
free_copy:
free_copy(info);
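
In the module.c hunk, copy_chunked_from_user() bounds how long the kernel stays busy copying a large module image: the copy proceeds in 16-page chunks with a cond_resched() between them instead of one potentially huge copy_from_user(). A runnable userspace analogue of the same chunking loop, with memcpy() and sched_yield() standing in for copy_from_user() and cond_resched():

/* Userspace analogue of copy_chunked_from_user(): copy a large buffer in
 * bounded chunks and offer to yield between chunks. */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK_SIZE (16UL * 4096)        /* mirrors 16*PAGE_SIZE on x86 */

static int copy_chunked(void *dst, const void *src, unsigned long len)
{
        do {
                unsigned long n = len < CHUNK_SIZE ? len : CHUNK_SIZE;

                memcpy(dst, src, n);    /* copy_from_user() in the kernel */
                sched_yield();          /* cond_resched() in the kernel */
                dst = (char *)dst + n;
                src = (const char *)src + n;
                len -= n;
        } while (len);
        return 0;
}

int main(void)
{
        unsigned long len = 10UL * 1024 * 1024;
        char *src = malloc(len), *dst = malloc(len);

        if (!src || !dst)
                return 1;
        memset(src, 0xab, len);
        copy_chunked(dst, src, len);
        printf("copied %lu bytes, last byte 0x%02x\n", len,
               (unsigned char)dst[len - 1]);
        free(src);
        free(dst);
        return 0;
}
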
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c24d5a2..5235dd4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
}
-static bool is_nosave_page(unsigned long pfn)
-{
- struct nosave_region *region;
-
- list_for_each_entry(region, &nosave_regions, list) {
- if (pfn >= region->start_pfn && pfn < region->end_pfn) {
- pr_err("PM: %#010llx in e820 nosave region: "
- "[mem %#010llx-%#010llx]\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (unsigned long long) region->start_pfn << PAGE_SHIFT,
- ((unsigned long long) region->end_pfn << PAGE_SHIFT)
- - 1);
- return true;
- }
- }
-
- return false;
-}
-
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
do {
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
+ if (likely(pfn_valid(pfn)))
swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d0c4209..3d5f6f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3049,6 +3049,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
} else {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3..241213b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
* completely idle or all activity is areas that are not of interest
- * to automatic numa balancing. Scan slower
+ * to automatic numa balancing. Related to that, if there were failed
+ * migration then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower
*/
- if (local + shared == 0) {
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
@@ -2161,8 +2165,10 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma)) {
continue;
+ }
/*
* Shared library pages mapped by multiple processes are not
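
The fair.c hunks make failed NUMA page migrations (counted in numa_faults_locality[2] via TNF_MIGRATE_FAIL) slow the scanner down just like a task with no locality faults at all: the scan period is doubled and clamped to the per-task maximum, so migration pressure backs off exponentially, and hugetlb VMAs are now skipped by the scanner altogether. A small runnable illustration of that double-and-clamp back-off (the numbers are arbitrary):

/* Runnable illustration of the scan-period back-off in the hunk above:
 * double the period, clamp to the maximum. */
#include <stdio.h>

static unsigned int slow_down_scan(unsigned int period, unsigned int period_max)
{
        unsigned int doubled = period << 1;

        return doubled < period_max ? doubled : period_max;
}

int main(void)
{
        unsigned int period = 1000, period_max = 60000; /* msec, illustrative */

        for (int i = 0; i < 8; i++) {
                period = slow_down_scan(period, period_max);
                printf("scan period now %u ms\n", period);
        }
        return 0;
}
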
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6..ce410bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "nr_pdflush_threads",
.mode = 0444 /* read-only */,
.proc_handler = pdflush_proc_obsolete,
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5..6aac4be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
+ int bc_moved;
/*
* We try to cancel the timer first. If the callback is on
* flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* restart the timer because we are in the callback, but we
* can set the expiry time and let the callback return
* HRTIMER_RESTART.
+ *
+ * Since we are in the idle loop at this point and because
+ * hrtimer_{start/cancel} functions call into tracing,
+ * calls to these functions must be bound within RCU_NONIDLE.
*/
- if (hrtimer_try_to_cancel(&bctimer) >= 0) {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
+ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
+ 0);
+ if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
} else if (bc->bound_on == smp_processor_id()) {
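
The final hunk exists because bc_set_next() runs from the idle loop: hrtimer_try_to_cancel() and hrtimer_start() can land in tracepoints, tracepoints rely on RCU, and RCU is not watching an idle CPU, so the calls are wrapped in RCU_NONIDLE() and bc_moved records whether this CPU actually (re)armed the broadcast timer. A kernel-style sketch of that rule (struct my_dev and its fields are hypothetical; the pattern simply mirrors the hunk above):

/* Kernel-style sketch only: any call that may hit a tracepoint from the
 * idle path is kept inside RCU_NONIDLE() so RCU is watching around it. */
static void rearm_from_idle(struct my_dev *dev, ktime_t expires)
{
        int armed;

        /* hrtimer_{try_to_cancel,start}() can call into tracing, which
         * uses RCU, so from idle they must run inside RCU_NONIDLE(). */
        RCU_NONIDLE(armed = (hrtimer_try_to_cancel(&dev->timer) >= 0) ?
                            !hrtimer_start(&dev->timer, expires,
                                           HRTIMER_MODE_ABS_PINNED) :
                            0);

        if (armed)
                dev->owner = smp_processor_id();    /* this CPU owns the expiry */
}
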