summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c248
-rw-r--r--kernel/cpu.c29
-rw-r--r--kernel/debug/debug_core.c32
-rw-r--r--kernel/debug/debug_core.h3
-rw-r--r--kernel/debug/kdb/kdb_debugger.c5
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/delayacct.c7
-rw-r--r--kernel/elfcore.c10
-rw-r--r--kernel/events/core.c172
-rw-r--r--kernel/events/internal.h35
-rw-r--r--kernel/events/ring_buffer.c101
-rw-r--r--kernel/events/uprobes.c223
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/gcov/Kconfig30
-rw-r--r--kernel/gcov/Makefile32
-rw-r--r--kernel/gcov/base.c32
-rw-r--r--kernel/gcov/fs.c52
-rw-r--r--kernel/gcov/gcc_3_4.c115
-rw-r--r--kernel/gcov/gcc_4_7.c560
-rw-r--r--kernel/gcov/gcov.h65
-rw-r--r--kernel/irq/irqdomain.c13
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/jump_label.c5
-rw-r--r--kernel/kprobes.c4
-rw-r--r--kernel/kthread.c73
-rw-r--r--kernel/module.c103
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid_namespace.c8
-rw-r--r--kernel/power/Kconfig16
-rw-r--r--kernel/power/qos.c26
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/user.c20
-rw-r--r--kernel/printk/printk.c35
-rw-r--r--kernel/ptrace.c3
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/fair.c38
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/smp.c14
-rw-r--r--kernel/softirq.c37
-rw-r--r--kernel/stop_machine.c15
-rw-r--r--kernel/sys.c1
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/sysctl_binary.c6
-rw-r--r--kernel/taskstats.c16
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c52
-rw-r--r--kernel/time/ntp.c3
-rw-r--r--kernel/time/sched_clock.c114
-rw-r--r--kernel/time/tick-broadcast.c1
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/time/timer_stats.c8
-rw-r--r--kernel/trace/blktrace.c36
-rw-r--r--kernel/trace/trace.c3
-rw-r--r--kernel/trace/trace.h1
-rw-r--r--kernel/trace/trace_event_perf.c2
-rw-r--r--kernel/trace/trace_output.c19
60 files changed, 1590 insertions, 879 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfd..e0839bcd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -125,38 +125,6 @@ struct cfent {
};
/*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
- */
-#define CSS_ID_MAX (65535)
-struct css_id {
- /*
- * The css to which this ID points. This pointer is set to valid value
- * after cgroup is populated. If cgroup is removed, this will be NULL.
- * This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_tryget()
- * should be used for avoiding race.
- */
- struct cgroup_subsys_state __rcu *css;
- /*
- * ID of this css.
- */
- unsigned short id;
- /*
- * Depth in hierarchy which this ID belongs to.
- */
- unsigned short depth;
- /*
- * ID is freed by RCU. (and lookup routine is RCU safe.)
- */
- struct rcu_head rcu_head;
- /*
- * Hierarchy of CSS ID belongs to.
- */
- unsigned short stack[0]; /* Array of Length (depth+1) */
-};
-
-/*
* cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
@@ -387,9 +355,6 @@ struct cgrp_cset_link {
static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
-
/*
* css_set_lock protects the list of css_set objects, and the chain of
* tasks off each css_set. Nests outside task->alloc_lock due to
@@ -841,8 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static int alloc_css_id(struct cgroup_subsys_state *child_css);
-
static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -4240,21 +4203,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
goto err;
}
}
-
- /* This cgroup is ready now */
- for_each_root_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
- struct css_id *id = rcu_dereference_protected(css->id, true);
-
- /*
- * Update id->css pointer and make this css visible from
- * CSS ID functions. This pointer will be dereferened
- * from RCU-read-side without locks.
- */
- if (id)
- rcu_assign_pointer(id->css, css);
- }
-
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
@@ -4323,7 +4271,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
css->cgroup = cgrp;
css->ss = ss;
css->flags = 0;
- css->id = NULL;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss);
@@ -4455,12 +4402,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_free_all;
init_css(css, ss, cgrp);
-
- if (ss->use_id) {
- err = alloc_css_id(css);
- if (err)
- goto err_free_all;
- }
}
/*
@@ -4925,12 +4866,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
- /* init_idr must be after init_css() because it sets css->id. */
- if (ss->use_id) {
- ret = cgroup_init_idr(ss, css);
- if (ret)
- goto err_unload;
- }
/*
* Now we need to entangle the css into the existing css_sets. unlike
@@ -4996,9 +4931,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
offline_css(cgroup_css(cgroup_dummy_top, ss));
- if (ss->use_id)
- idr_destroy(&ss->idr);
-
/* deassign the subsys_id */
cgroup_subsys[ss->subsys_id] = NULL;
@@ -5025,8 +4957,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
- * the cgrp->subsys pointer to find their state. note that this
- * also takes care of freeing the css_id.
+ * the cgrp->subsys pointer to find their state.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@ -5097,8 +5028,6 @@ int __init cgroup_init(void)
for_each_builtin_subsys(ss, i) {
if (!ss->early_init)
cgroup_init_subsys(ss);
- if (ss->use_id)
- cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
}
/* allocate id for the dummy hierarchy */
@@ -5518,181 +5447,6 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-/*
- * Functons for CSS ID.
- */
-
-/* to get ID other than 0, this should be called when !cgroup_is_dead() */
-unsigned short css_id(struct cgroup_subsys_state *css)
-{
- struct css_id *cssid;
-
- /*
- * This css_id() can return correct value when somone has refcnt
- * on this or this is under rcu_read_lock(). Once css->id is allocated,
- * it's unchanged until freed.
- */
- cssid = rcu_dereference_raw(css->id);
-
- if (cssid)
- return cssid->id;
- return 0;
-}
-EXPORT_SYMBOL_GPL(css_id);
-
-/**
- * css_is_ancestor - test "root" css is an ancestor of "child"
- * @child: the css to be tested.
- * @root: the css supporsed to be an ancestor of the child.
- *
- * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, the caller must hold rcu_read_lock().
- * But, considering usual usage, the csses should be valid objects after test.
- * Assuming that the caller will do some action to the child if this returns
- * returns true, the caller must take "child";s reference count.
- * If "child" is valid object and this returns true, "root" is valid, too.
- */
-
-bool css_is_ancestor(struct cgroup_subsys_state *child,
- const struct cgroup_subsys_state *root)
-{
- struct css_id *child_id;
- struct css_id *root_id;
-
- child_id = rcu_dereference(child->id);
- if (!child_id)
- return false;
- root_id = rcu_dereference(root->id);
- if (!root_id)
- return false;
- if (child_id->depth < root_id->depth)
- return false;
- if (child_id->stack[root_id->depth] != root_id->id)
- return false;
- return true;
-}
-
-void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
-{
- struct css_id *id = rcu_dereference_protected(css->id, true);
-
- /* When this is called before css_id initialization, id can be NULL */
- if (!id)
- return;
-
- BUG_ON(!ss->use_id);
-
- rcu_assign_pointer(id->css, NULL);
- rcu_assign_pointer(css->id, NULL);
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, id->id);
- spin_unlock(&ss->id_lock);
- kfree_rcu(id, rcu_head);
-}
-EXPORT_SYMBOL_GPL(free_css_id);
-
-/*
- * This is called by init or create(). Then, calls to this function are
- * always serialized (By cgroup_mutex() at create()).
- */
-
-static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
-{
- struct css_id *newid;
- int ret, size;
-
- BUG_ON(!ss->use_id);
-
- size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- newid = kzalloc(size, GFP_KERNEL);
- if (!newid)
- return ERR_PTR(-ENOMEM);
-
- idr_preload(GFP_KERNEL);
- spin_lock(&ss->id_lock);
- /* Don't use 0. allocates an ID of 1-65535 */
- ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
- spin_unlock(&ss->id_lock);
- idr_preload_end();
-
- /* Returns error when there are no free spaces for new ID.*/
- if (ret < 0)
- goto err_out;
-
- newid->id = ret;
- newid->depth = depth;
- return newid;
-err_out:
- kfree(newid);
- return ERR_PTR(ret);
-
-}
-
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *rootcss)
-{
- struct css_id *newid;
-
- spin_lock_init(&ss->id_lock);
- idr_init(&ss->idr);
-
- newid = get_new_cssid(ss, 0);
- if (IS_ERR(newid))
- return PTR_ERR(newid);
-
- newid->stack[0] = newid->id;
- RCU_INIT_POINTER(newid->css, rootcss);
- RCU_INIT_POINTER(rootcss->id, newid);
- return 0;
-}
-
-static int alloc_css_id(struct cgroup_subsys_state *child_css)
-{
- struct cgroup_subsys_state *parent_css = css_parent(child_css);
- struct css_id *child_id, *parent_id;
- int i, depth;
-
- parent_id = rcu_dereference_protected(parent_css->id, true);
- depth = parent_id->depth + 1;
-
- child_id = get_new_cssid(child_css->ss, depth);
- if (IS_ERR(child_id))
- return PTR_ERR(child_id);
-
- for (i = 0; i < depth; i++)
- child_id->stack[i] = parent_id->stack[i];
- child_id->stack[depth] = child_id->id;
- /*
- * child_id->css pointer will be set after this cgroup is available
- * see cgroup_populate_dir()
- */
- rcu_assign_pointer(child_css->id, child_id);
-
- return 0;
-}
-
-/**
- * css_lookup - lookup css by id
- * @ss: cgroup subsys to be looked into.
- * @id: the id
- *
- * Returns pointer to cgroup_subsys_state if there is valid one with id.
- * NULL if not. Should be called under rcu_read_lock()
- */
-struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
-{
- struct css_id *cssid = NULL;
-
- BUG_ON(!ss->use_id);
- cssid = idr_find(&ss->idr, id);
-
- if (unlikely(!cssid))
- return NULL;
-
- return rcu_dereference(cssid->css);
-}
-EXPORT_SYMBOL_GPL(css_lookup);
-
/**
* css_from_dir - get corresponding css from the dentry of a cgroup dir
* @dentry: directory dentry of interest
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 63aa50d..973d034 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -437,11 +437,6 @@ int cpu_up(unsigned int cpu)
{
int err = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG
- int nid;
- pg_data_t *pgdat;
-#endif
-
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
@@ -452,27 +447,9 @@ int cpu_up(unsigned int cpu)
return -EINVAL;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
- nid = cpu_to_node(cpu);
- if (!node_online(nid)) {
- err = mem_online_node(nid);
- if (err)
- return err;
- }
-
- pgdat = NODE_DATA(nid);
- if (!pgdat) {
- printk(KERN_ERR
- "Can't online cpu %d due to NULL pgdat\n", cpu);
- return -ENOMEM;
- }
-
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
-#endif
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
cpu_maps_update_begin();
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d44..7d2f35e 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
raw_spin_lock(&dbg_slave_lock);
#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
/* Signal the other CPUs to enter kgdb_wait() */
- if ((!kgdb_single_step) && kgdb_do_roundup)
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
+ memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
ks->signo = signo;
ks->err_code = ecode;
- ks->kgdb_usethreadid = 0;
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
{
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967..572aa4f 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
+ atomic_t *send_ready;
};
/* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
+#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
return DBG_PASS_EVENT;
}
+#define KGDB_KDB_REASON_SYSTEM_NMI 0
#endif /* CONFIG_KGDB_KDB */
#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef..8859ca3 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
- if (in_nmi())
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
reason = KDB_REASON_NMI;
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7..0b097c8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
instruction_pointer(regs));
kdb_dumpregs(regs);
break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
case KDB_REASON_NMI:
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d473988..54996b7 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
struct timespec ts;
cputime_t utime, stime, stimescaled, utimescaled;
- /* Though tsk->delays accessed later, early exit avoids
- * unnecessary returning of other data
- */
- if (!tsk->delays)
- goto done;
-
tmp = (s64)d->cpu_run_real_total;
task_cputime(tsk, &utime, &stime);
cputime_to_timespec(utime + stime, &ts);
@@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_count += tsk->delays->freepages_count;
spin_unlock_irqrestore(&tsk->delays->lock, flags);
-done:
return 0;
}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index ff915ef..e556751 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -1,23 +1,19 @@
#include <linux/elf.h>
#include <linux/fs.h>
#include <linux/mm.h>
-
-#include <asm/elf.h>
-
+#include <linux/binfmts.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
return 0;
}
-int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
return 1;
}
-int __weak elf_core_write_extra_data(struct file *file, size_t *size,
- unsigned long limit)
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
{
return 1;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c143..d724e77 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
-static atomic_t perf_sample_allowed_ns __read_mostly =
- ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
void update_perf_cpu_limits(void)
{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
tmp *= sysctl_perf_cpu_time_max_percent;
do_div(tmp, 100);
- atomic_set(&perf_sample_allowed_ns, tmp);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
* we detect that events are taking too long.
*/
#define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
void perf_sample_event_took(u64 sample_len_ns)
{
u64 avg_local_sample_len;
u64 local_samples_len;
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- if (atomic_read(&perf_sample_allowed_ns) == 0)
+ if (allowed_ns == 0)
return;
/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
*/
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
- if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+ if (avg_local_sample_len <= allowed_ns)
return;
if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
printk_ratelimited(KERN_WARNING
- "perf samples too long (%lld > %d), lowering "
+ "perf samples too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len,
- atomic_read(&perf_sample_allowed_ns),
+ avg_local_sample_len, allowed_ns,
sysctl_perf_event_sample_rate);
update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
+ ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ ctx->generation++;
}
/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
event->header_size = size;
}
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && !ctx1->pin_count && !ctx2->pin_count;
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
@@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
-#define list_next_entry(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
+ struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent && !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
+unlock:
rcu_read_unlock();
if (do_switch) {
@@ -4572,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5100,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- const char *name;
-
- memset(tmp, 0, sizeof(tmp));
+ char *name;
if (file) {
struct inode *inode;
dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
/*
- * d_path works from the end of the rb backwards, so we
+ * d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
- if (!buf) {
- name = strncpy(tmp, "//enomem", sizeof(tmp));
- goto got_name;
- }
- name = d_path(&file->f_path, buf, PATH_MAX);
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
- name = strncpy(tmp, "//toolong", sizeof(tmp));
- goto got_name;
+ name = "//toolong";
+ goto cpy_name;
}
inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
@@ -5128,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
-
+ goto got_name;
} else {
- if (arch_vma_name(mmap_event->vma)) {
- name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp) - 1);
- tmp[sizeof(tmp) - 1] = '\0';
- goto got_name;
- }
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
- if (!vma->vm_mm) {
- name = strncpy(tmp, "[vdso]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = strncpy(tmp, "[heap]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = strncpy(tmp, "[stack]", sizeof(tmp));
- goto got_name;
+ name = "[stack]";
+ goto cpy_name;
}
- name = strncpy(tmp, "//anon", sizeof(tmp));
- goto got_name;
+ name = "//anon";
+ goto cpy_name;
}
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
got_name:
- size = ALIGN(strlen(name)+1, sizeof(u64));
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
@@ -6292,6 +6329,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6374,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
return count;
}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-static struct device_attribute pmu_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_RW(perf_event_mux_interval_ms),
- __ATTR_NULL,
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_attrs = pmu_dev_attrs,
+ .dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
@@ -7126,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
perf_install_in_context(ctx, event, event->cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -7209,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca65997..569b2187 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned int \
+static inline unsigned long \
func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned int len) \
+ const void *buf, unsigned long len) \
{ \
unsigned long size, written; \
\
do { \
- size = min_t(unsigned long, handle->size, len); \
- \
+ size = min(handle->size, len); \
written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
\
len -= written; \
handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
-static inline int memcpy_common(void *dst, const void *src, size_t n)
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
- return n;
+ return 0;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
-#define MEMCPY_SKIP(dst, src, n) (n)
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
-DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
#ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user __copy_from_user_inatomic
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
+}
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfb..e8b168a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/circ_buf.h>
#include "internal.h"
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long sz = perf_data_size(rb);
- unsigned long mask = sz - 1;
-
- /*
- * check if user-writable
- * overwrite : over-write its own tail
- * !overwrite: buffer possibly drops events.
- */
- if (rb->overwrite)
- return true;
-
- /*
- * verify that payload is not bigger than buffer
- * otherwise masking logic may fail to detect
- * the "not enough space" condition
- */
- if ((head - offset) > sz)
- return false;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,8 @@ again:
rb->user_page->data_head = head;
/*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read rb->head.
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
+ int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (!rb)
+ if (unlikely(!rb))
goto out;
- handle->rb = rb;
- handle->event = event;
-
- if (!rb->nr_pages)
+ if (unlikely(!rb->nr_pages))
goto out;
+ handle->rb = rb;
+ handle->event = event;
+
have_lost = local_read(&rb->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- *
- * See perf_output_put_handle().
- */
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_mb();
offset = head = local_read(&rb->head);
- head += size;
- if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
+ head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- if (head - local_read(&rb->wakeup) > rb->watermark)
+ /*
+ * Separate the userpage->tail read from the data stores below.
+ * Matches the MB userspace SHOULD issue after reading the data
+ * and before storing the new tail position.
+ *
+ * See perf_output_put_handle().
+ */
+ smp_mb();
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(rb));
- handle->page &= rb->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
- handle->addr = rb->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+ page_shift = PAGE_SHIFT + page_order(rb);
- if (have_lost) {
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
+
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bd..24b7d6c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
#include <linux/uprobes.h>
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
* supported by that architecture then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
*/
/*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* For mm @mm, write the opcode at @vaddr.
* Return 0 (success) or a negative errno.
*/
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
return ret;
}
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
- unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
{
struct page *page;
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
- struct address_space *mapping;
- unsigned long nbytes;
- int bytes;
-
- nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
- mapping = uprobe->inode->i_mapping;
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = uprobe->arch.insn;
+ int size = MAX_UINSN_BYTES;
+ int len, err = -EIO;
- /* Instruction at end of binary; copy only available bytes */
- if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
- bytes = uprobe->inode->i_size - uprobe->offset;
- else
- bytes = MAX_UINSN_BYTES;
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
- /* Instruction at the page-boundary; copy bytes in second page */
- if (nbytes < bytes) {
- int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
- bytes - nbytes, uprobe->offset + nbytes);
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
if (err)
- return err;
- bytes = nbytes;
- }
- return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
}
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* write_opcode() assumes we don't cross page boundary */
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
}
/* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- struct mm_struct *mm = current->mm;
int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
- ret = -ENOMEM;
- /* Try to map as high as possible, this is only a hint. */
- area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
- ret = area->vaddr;
- goto fail;
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
}
ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
- ret = 0;
fail:
up_write(&mm->mmap_sem);
return ret;
}
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- struct xol_area *area;
uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
- area = mm->uprobes_state.xol_area;
- if (area)
- goto ret;
-
- area = kzalloc(sizeof(*area), GFP_KERNEL);
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
if (!area->page)
goto free_bitmap;
- /* allocate first slot of task's xol_area for the return probes */
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
atomic_set(&area->slot_count, 1);
- init_waitqueue_head(&area->wq);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
- if (!xol_add_vma(area))
+ if (!xol_add_vma(mm, area))
return area;
__free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
free_area:
kfree(area);
out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
area = mm->uprobes_state.xol_area;
- ret:
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
return 0;
/* Initialize the slot */
- copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
+ copy_to_page(area->page, xol_vaddr,
+ uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
- t->utask = NULL;
-}
-
-/*
* Allocate a uprobe_task object for the task if if necessary.
* Called when the thread hits a breakpoint.
*
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ kfree(work);
+
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->vaddr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct callback_head *work;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ /* TODO: move it into the union in uprobe_task */
+ work = kmalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ return uprobe_warn(t, "dup xol area");
+
+ t->utask->vaddr = area->vaddr;
+ init_task_work(work, dup_xol_work);
+ task_work_add(t, work, true);
+}
+
/*
* Current area->vaddr notion assume the trampoline address is always
* equal area->vaddr.
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
return register_die_notifier(&uprobe_exception_nb);
}
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index c93be06..f6d11fc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
- uprobe_copy_process(p);
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1487,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
+ uprobe_copy_process(p, clone_flags);
return p;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d4da55d..d04ce8a 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
+choice
+ prompt "Specify GCOV format"
+ depends on GCOV_KERNEL
+ default GCOV_FORMAT_AUTODETECT
+ ---help---
+ The gcov format is usually determined by the GCC version, but there are
+ exceptions where format changes are integrated in lower-version GCCs.
+ In such a case use this option to adjust the format used in the kernel
+ accordingly.
+
+ If unsure, choose "Autodetect".
+
+config GCOV_FORMAT_AUTODETECT
+ bool "Autodetect"
+ ---help---
+ Select this option to use the format that corresponds to your GCC
+ version.
+
+config GCOV_FORMAT_3_4
+ bool "GCC 3.4 format"
+ ---help---
+ Select this option to use the format defined by GCC 3.4.
+
+config GCOV_FORMAT_4_7
+ bool "GCC 4.7 format"
+ ---help---
+ Select this option to use the format defined by GCC 4.7.
+
+endchoice
+
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index e97ca59..52aa7e8 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,33 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
+# if-lt
+# Usage VAR := $(call if-lt, $(a), $(b))
+# Returns 1 if (a < b)
+if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
+
+ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
+ cc-ver := 0304
+else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
+ cc-ver := 0407
+else
+# Use cc-version if available, otherwise set 0
+#
+# scripts/Kbuild.include, which contains cc-version function, is not included
+# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
+# Meaning cc-ver is empty causing if-lt test to fail with
+# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
+# This has no affect on the clean phase, but the error message could be
+# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
+# is not available. We can probably move if-lt to Kbuild.include, so it's also
+# not defined during clean or to include Kbuild.include in
+# scripts/Makefile.clean. But the following workaround seems least invasive.
+ cc-ver := $(if $(call cc-version),$(call cc-version),0)
+endif
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
+
+ifeq ($(call if-lt, $(cc-ver), 0407),1)
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
+else
+ obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
+endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9b22d03..f45b75b 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -20,7 +20,6 @@
#include <linux/mutex.h>
#include "gcov.h"
-static struct gcov_info *gcov_info_head;
static int gcov_events_enabled;
static DEFINE_MUTEX(gcov_lock);
@@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)
mutex_lock(&gcov_lock);
if (gcov_version == 0) {
- gcov_version = info->version;
+ gcov_version = gcov_info_version(info);
/*
* Printing gcc's version magic may prove useful for debugging
* incompatibility reports.
@@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)
* Add new profiling data structure to list and inform event
* listener.
*/
- info->next = gcov_info_head;
- gcov_info_head = info;
+ gcov_info_link(info);
if (gcov_events_enabled)
gcov_event(GCOV_ADD, info);
mutex_unlock(&gcov_lock);
@@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_delta);
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
@@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);
*/
void gcov_enable_events(void)
{
- struct gcov_info *info;
+ struct gcov_info *info = NULL;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
+
/* Perform event callback for previously registered entries. */
- for (info = gcov_info_head; info; info = info->next)
+ while ((info = gcov_info_next(info)))
gcov_event(GCOV_ADD, info);
+
mutex_unlock(&gcov_lock);
}
@@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
- struct gcov_info *info;
- struct gcov_info *prev;
+ struct gcov_info *info = NULL;
+ struct gcov_info *prev = NULL;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
- prev = NULL;
+
/* Remove entries located in module from linked list. */
- for (info = gcov_info_head; info; info = info->next) {
+ while ((info = gcov_info_next(info))) {
if (within(info, mod->module_core, mod->core_size)) {
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
+ gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
+
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 7a7d2ee..15ff01a 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str)
unsigned long val;
if (kstrtoul(str, 0, &val)) {
- pr_warning("invalid gcov_persist parameter '%s'\n", str);
+ pr_warn("invalid gcov_persist parameter '%s'\n", str);
return 0;
}
gcov_persist = val;
@@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)
list_for_each_entry(node, &all_head, all) {
info = get_node_info(node);
- if (info && (strcmp(info->filename, name) == 0))
+ if (info && (strcmp(gcov_info_filename(info), name) == 0))
return node;
}
@@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
seq = file->private_data;
info = gcov_iter_get_info(seq->private);
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
if (node) {
/* Reset counts or remove node for unloaded modules. */
if (node->num_loaded == 0)
@@ -365,7 +365,7 @@ static const char *deskew(const char *basename)
*/
static void add_links(struct gcov_node *node, struct dentry *parent)
{
- char *basename;
+ const char *basename;
char *target;
int num;
int i;
@@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
if (!node->links)
return;
for (i = 0; i < num; i++) {
- target = get_link_target(get_node_info(node)->filename,
- &gcov_link[i]);
+ target = get_link_target(
+ gcov_info_filename(get_node_info(node)),
+ &gcov_link[i]);
if (!target)
goto out_err;
- basename = strrchr(target, '/');
- if (!basename)
+ basename = kbasename(target);
+ if (basename == target)
goto out_err;
- basename++;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
if (!node->links[i])
@@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
if (!node->dentry) {
- pr_warning("could not create file\n");
+ pr_warn("could not create file\n");
kfree(node);
return NULL;
}
@@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,
err_nomem:
kfree(node);
- pr_warning("out of memory\n");
+ pr_warn("out of memory\n");
return NULL;
}
@@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)
struct gcov_node *parent;
struct gcov_node *node;
- filename = kstrdup(info->filename, GFP_KERNEL);
+ filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
if (!filename)
return;
parent = &root_node;
@@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
*/
loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
if (!loaded_info) {
- pr_warning("could not add '%s' (out of memory)\n",
- info->filename);
+ pr_warn("could not add '%s' (out of memory)\n",
+ gcov_info_filename(info));
return;
}
memcpy(loaded_info, node->loaded_info,
@@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* data set replaces the copy of the last one.
*/
if (!gcov_info_is_compatible(node->unloaded_info, info)) {
- pr_warning("discarding saved data for %s "
- "(incompatible version)\n", info->filename);
+ pr_warn("discarding saved data for %s "
+ "(incompatible version)\n",
+ gcov_info_filename(info));
gcov_info_free(node->unloaded_info);
node->unloaded_info = NULL;
}
@@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)
* The initial one takes precedence.
*/
if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
- pr_warning("could not add '%s' (incompatible "
- "version)\n", info->filename);
+ pr_warn("could not add '%s' (incompatible "
+ "version)\n", gcov_info_filename(info));
kfree(loaded_info);
return;
}
@@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)
else {
node->unloaded_info = gcov_info_dup(info);
if (!node->unloaded_info) {
- pr_warning("could not save data for '%s' "
- "(out of memory)\n", info->filename);
+ pr_warn("could not save data for '%s' "
+ "(out of memory)\n",
+ gcov_info_filename(info));
}
}
}
@@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)
i = get_info_index(node, info);
if (i < 0) {
- pr_warning("could not remove '%s' (not found)\n",
- info->filename);
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
return;
}
if (gcov_persist)
@@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
struct gcov_node *node;
mutex_lock(&node_lock);
- node = get_node_by_name(info->filename);
+ node = get_node_by_name(gcov_info_filename(info));
switch (action) {
case GCOV_ADD:
if (node)
@@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
if (node)
remove_info(node, info);
else {
- pr_warning("could not remove '%s' (not found)\n",
- info->filename);
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
}
break;
}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb42..27bc88a 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -21,6 +21,121 @@
#include <linux/vmalloc.h>
#include "gcov.h"
+#define GCOV_COUNTERS 5
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 0000000..2c6e463
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,560 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 4.7.
+ *
+ * This file is based heavily on gcc_3_4.c file.
+ *
+ * For a better understanding, refer to gcc source:
+ * gcc/gcov-io.h
+ * libgcc/libgcov.c
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+#define GCOV_COUNTERS 8
+#define GCOV_TAG_FUNCTION_LENGTH 3
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_ctr_info - information about counters for a single function
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+};
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @key: comdat key
+ * @ident: unique ident of function
+ * @lineno_checksum: function lineo_checksum
+ * @cfg_checksum: function cfg checksum
+ * @ctrs: instrumented counters
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ *
+ * Information about a single function. This uses the trailing array
+ * idiom. The number of counters is determined from the merge pointer
+ * array in gcov_info. The key is used to detect which of a set of
+ * comdat functions was selected -- it points to the gcov_info object
+ * of the object file containing the selected comdat function.
+ */
+struct gcov_fn_info {
+ const struct gcov_info *key;
+ unsigned int ident;
+ unsigned int lineno_checksum;
+ unsigned int cfg_checksum;
+ struct gcov_ctr_info ctrs[0];
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: uniquifying time stamp
+ * @filename: name of the associated gcov data file
+ * @merge: merge functions (null for unused counter type)
+ * @n_functions: number of instrumented functions
+ * @functions: pointer to pointers to function information
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
+ unsigned int n_functions;
+ struct gcov_fn_info **functions;
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Doesn't change at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return info->merge[type] ? 1 : 0;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ memset(ci_ptr->values, 0,
+ sizeof(gcov_type) * ci_ptr->num);
+ ci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_ctr_info *dci_ptr;
+ struct gcov_ctr_info *sci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int val_idx;
+
+ for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
+ dci_ptr = dst->functions[fi_idx]->ctrs;
+ sci_ptr = src->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(src, ct_idx))
+ continue;
+
+ for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
+ dci_ptr->values[val_idx] +=
+ sci_ptr->values[val_idx];
+
+ dci_ptr++;
+ sci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_ctr_info *dci_ptr; /* dst counter info */
+ struct gcov_ctr_info *sci_ptr; /* src counter info */
+ unsigned int active;
+ unsigned int fi_idx; /* function info idx */
+ unsigned int ct_idx; /* counter type idx */
+ size_t fi_size; /* function info size */
+ size_t cv_size; /* counter values size */
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+
+ dup->next = NULL;
+ dup->filename = NULL;
+ dup->functions = NULL;
+
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+
+ dup->functions = kcalloc(info->n_functions,
+ sizeof(struct gcov_fn_info *), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+
+ active = num_counter_active(info);
+ fi_size = sizeof(struct gcov_fn_info);
+ fi_size += sizeof(struct gcov_ctr_info) * active;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
+ if (!dup->functions[fi_idx])
+ goto err_free;
+
+ *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
+
+ sci_ptr = info->functions[fi_idx]->ctrs;
+ dci_ptr = dup->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++) {
+
+ cv_size = sizeof(gcov_type) * sci_ptr->num;
+
+ dci_ptr->values = vmalloc(cv_size);
+
+ if (!dci_ptr->values)
+ goto err_free;
+
+ dci_ptr->num = sci_ptr->num;
+ memcpy(dci_ptr->values, sci_ptr->values, cv_size);
+
+ sci_ptr++;
+ dci_ptr++;
+ }
+ }
+
+ return dup;
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ struct gcov_ctr_info *ci_ptr;
+
+ if (!info->functions)
+ goto free_info;
+
+ active = num_counter_active(info);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ if (!info->functions[fi_idx])
+ continue;
+
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
+ vfree(ci_ptr->values);
+
+ kfree(info->functions[fi_idx]);
+ }
+
+free_info:
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int cv_idx;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->stamp);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ fi_ptr = info->functions[fi_idx];
+
+ /* Function record. */
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ ci_ptr = fi_ptr->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ /* Counter record. */
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FOR_COUNTER(ct_idx));
+ pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+
+ for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
+ pos += store_gcov_u64(buffer, pos,
+ ci_ptr->values[cv_idx]);
+ }
+
+ ci_ptr++;
+ }
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073e..92c8e22 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,7 +21,6 @@
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
-#define GCOV_COUNTERS 5
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
@@ -34,60 +33,18 @@ typedef long gcov_type;
typedef long long gcov_type;
#endif
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[0];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
+/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so
+ * we cannot use full definition here and they need to be placed in gcc specific
+ * implementation of gcov. This also means no direct access to the members in
+ * generic code and usage of the interface below.*/
+struct gcov_info;
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[0];
-};
+/* Interface to access gcov_info data */
+const char *gcov_info_filename(struct gcov_info *info);
+unsigned int gcov_info_version(struct gcov_info *info);
+struct gcov_info *gcov_info_next(struct gcov_info *info);
+void gcov_info_link(struct gcov_info *info);
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e..cf68bb3 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
}
EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-unsigned int irq_create_of_mapping(struct device_node *controller,
- const u32 *intspec, unsigned int intsize)
+unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
unsigned int virq;
- domain = controller ? irq_find_host(controller) : irq_default_domain;
+ domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
pr_warn("no irq domain found for %s !\n",
- of_node_full_name(controller));
+ of_node_full_name(irq_data->np));
return 0;
}
/* If domain has no translation, then we assume interrupt line */
if (domain->ops->xlate == NULL)
- hwirq = intspec[0];
+ hwirq = irq_data->args[0];
else {
- if (domain->ops->xlate(domain, controller, intspec, intsize,
- &hwirq, &type))
+ if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
+ irq_data->args_count, &hwirq, &type))
return 0;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd..3e59f95 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
goto out_mput;
}
- sched_setscheduler(t, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
/*
* We keep the reference to the task struct even if
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 297a924..9019f15 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable);
void static_key_slow_inc(struct static_key *key)
{
+ STATIC_KEY_CHECK_USE();
if (atomic_inc_not_zero(&key->enabled))
return;
@@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
+ STATIC_KEY_CHECK_USE();
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
+ STATIC_KEY_CHECK_USE();
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
@@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
+ STATIC_KEY_CHECK_USE();
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
@@ -212,6 +216,7 @@ void __init jump_label_init(void)
key->next = NULL;
#endif
}
+ static_key_initialized = true;
jump_label_unlock();
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a0d367a..ceeadfc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2066,7 +2066,7 @@ static int __init init_kprobes(void)
{
int i, err = 0;
unsigned long offset = 0, size = 0;
- char *modname, namebuf[128];
+ char *modname, namebuf[KSYM_NAME_LEN];
const char *symbol_name;
void *addr;
struct kprobe_blackpoint *kb;
@@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
- char *modname, namebuf[128];
+ char *modname, namebuf[KSYM_NAME_LEN];
head = &kprobe_table[i];
preempt_disable();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 760e86d..b5ae3ee 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -33,7 +33,7 @@ struct kthread_create_info
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
- struct completion done;
+ struct completion *done;
struct list_head list;
};
@@ -178,6 +178,7 @@ static int kthread(void *_create)
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
+ struct completion *done;
struct kthread self;
int ret;
@@ -187,10 +188,16 @@ static int kthread(void *_create)
init_completion(&self.parked);
current->vfork_done = &self.exited;
+ /* If user was SIGKILLed, I release the structure. */
+ done = xchg(&create->done, NULL);
+ if (!done) {
+ kfree(create);
+ do_exit(-EINTR);
+ }
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
- complete(&create->done);
+ complete(done);
schedule();
ret = -EINTR;
@@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
+ /* If user was SIGKILLed, I release the structure. */
+ struct completion *done = xchg(&create->done, NULL);
+
+ if (!done) {
+ kfree(create);
+ return;
+ }
create->result = ERR_PTR(pid);
- complete(&create->done);
+ complete(done);
}
}
@@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[],
...)
{
- struct kthread_create_info create;
-
- create.threadfn = threadfn;
- create.data = data;
- create.node = node;
- init_completion(&create.done);
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct task_struct *task;
+ struct kthread_create_info *create = kmalloc(sizeof(*create),
+ GFP_KERNEL);
+
+ if (!create)
+ return ERR_PTR(-ENOMEM);
+ create->threadfn = threadfn;
+ create->data = data;
+ create->node = node;
+ create->done = &done;
spin_lock(&kthread_create_lock);
- list_add_tail(&create.list, &kthread_create_list);
+ list_add_tail(&create->list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
- wait_for_completion(&create.done);
-
- if (!IS_ERR(create.result)) {
+ /*
+ * Wait for completion in killable state, for I might be chosen by
+ * the OOM killer while kthreadd is trying to allocate memory for
+ * new kernel thread.
+ */
+ if (unlikely(wait_for_completion_killable(&done))) {
+ /*
+ * If I was SIGKILLed before kthreadd (or new kernel thread)
+ * calls complete(), leave the cleanup of this structure to
+ * that thread.
+ */
+ if (xchg(&create->done, NULL))
+ return ERR_PTR(-ENOMEM);
+ /*
+ * kthreadd (or new kernel thread) will call complete()
+ * shortly.
+ */
+ wait_for_completion(&done);
+ }
+ task = create->result;
+ if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
- vsnprintf(create.result->comm, sizeof(create.result->comm),
- namefmt, args);
+ vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
va_end(args);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
- sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(create.result, cpu_all_mask);
+ sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(task, cpu_all_mask);
}
- return create.result;
+ kfree(create);
+ return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
diff --git a/kernel/module.c b/kernel/module.c
index dc58274..af5ebd2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms,
if (syms->licence == GPL_ONLY)
return false;
if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
- printk(KERN_WARNING "Symbol %s is being used "
- "by a non-GPL module, which will not "
- "be allowed in the future\n", fsa->name);
+ pr_warn("Symbol %s is being used by a non-GPL module, "
+ "which will not be allowed in the future\n",
+ fsa->name);
}
}
#ifdef CONFIG_UNUSED_SYMBOLS
if (syms->unused && fsa->warn) {
- printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
- "however this module is using it.\n", fsa->name);
- printk(KERN_WARNING
- "This symbol will go away in the future.\n");
- printk(KERN_WARNING
- "Please evalute if this is the right api to use and if "
- "it really is, submit a report the linux kernel "
- "mailinglist together with submitting your code for "
- "inclusion.\n");
+ pr_warn("Symbol %s is marked as UNUSED, however this module is "
+ "using it.\n", fsa->name);
+ pr_warn("This symbol will go away in the future.\n");
+ pr_warn("Please evalute if this is the right api to use and if "
+ "it really is, submit a report the linux kernel "
+ "mailinglist together with submitting your code for "
+ "inclusion.\n");
}
#endif
@@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)
return 0;
if (align > PAGE_SIZE) {
- printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
align = PAGE_SIZE;
}
mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
if (!mod->percpu) {
- printk(KERN_WARNING
- "%s: Could not allocate %lu bytes percpu data\n",
- mod->name, (unsigned long)pcpusec->sh_size);
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
return -ENOMEM;
}
mod->percpu_size = pcpusec->sh_size;
@@ -679,7 +676,7 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
- printk(KERN_WARNING "%s: out of memory loading\n", a->name);
+ pr_warn("%s: out of memory loading\n", a->name);
return -ENOMEM;
}
@@ -1145,8 +1142,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
- printk(KERN_WARNING "%s: %s: kernel tainted.\n",
- mod->name, reason);
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
@@ -1199,8 +1195,7 @@ static int check_version(Elf_Shdr *sechdrs,
goto bad_version;
}
- printk(KERN_WARNING "%s: no symbol version for %s\n",
- mod->name, symname);
+ pr_warn("%s: no symbol version for %s\n", mod->name, symname);
return 0;
bad_version:
@@ -1309,8 +1304,8 @@ resolve_symbol_wait(struct module *mod,
!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
|| PTR_ERR(ksym) != -EBUSY,
30 * HZ) <= 0) {
- printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
}
return ksym;
}
@@ -1626,15 +1621,14 @@ static int mod_sysfs_init(struct module *mod)
struct kobject *kobj;
if (!module_sysfs_initialized) {
- printk(KERN_ERR "%s: module sysfs not initialized\n",
- mod->name);
+ pr_err("%s: module sysfs not initialized\n", mod->name);
err = -EINVAL;
goto out;
}
kobj = kset_find_obj(module_kset, mod->name);
if (kobj) {
- printk(KERN_ERR "%s: module is already loaded\n", mod->name);
+ pr_err("%s: module is already loaded\n", mod->name);
kobject_put(kobj);
err = -EINVAL;
goto out;
@@ -1961,8 +1955,7 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(s->name, &owner, NULL, true, false)) {
- printk(KERN_ERR
- "%s: exports duplicate symbol %s"
+ pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, s->name, module_name(owner));
return -ENOEXEC;
@@ -2013,8 +2006,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
- mod->name, name, PTR_ERR(ksym));
+ pr_warn("%s: Unknown symbol %s (err %li)\n",
+ mod->name, name, PTR_ERR(ksym));
ret = PTR_ERR(ksym) ?: -ENOENT;
break;
@@ -2168,8 +2161,8 @@ static void set_license(struct module *mod, const char *license)
if (!license_is_gpl_compatible(license)) {
if (!test_taint(TAINT_PROPRIETARY_MODULE))
- printk(KERN_WARNING "%s: module license '%s' taints "
- "kernel.\n", mod->name, license);
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
}
@@ -2405,8 +2398,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
- printk(KERN_ERR "dynamic debug error adding module: %s\n",
- debug->modname);
+ pr_err("dynamic debug error adding module: %s\n",
+ debug->modname);
#endif
}
@@ -2619,8 +2612,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
Elf_Shdr *shdr = &info->sechdrs[i];
if (shdr->sh_type != SHT_NOBITS
&& info->len < shdr->sh_offset + shdr->sh_size) {
- printk(KERN_ERR "Module len %lu truncated\n",
- info->len);
+ pr_err("Module len %lu truncated\n", info->len);
return -ENOEXEC;
}
@@ -2682,15 +2674,14 @@ static struct module *setup_load_info(struct load_info *info, int flags)
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
- printk(KERN_WARNING "No module found in object\n");
+ pr_warn("No module found in object\n");
return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
if (info->index.sym == 0) {
- printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
- mod->name);
+ pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
return ERR_PTR(-ENOEXEC);
}
@@ -2717,7 +2708,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
if (err)
return err;
} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+ pr_err("%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
return -ENOEXEC;
}
@@ -2727,9 +2718,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- printk(KERN_WARNING "%s: module is from the staging directory,"
- " the quality is unknown, you have been warned.\n",
- mod->name);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
}
/* Set up license info based on the info section */
@@ -2801,8 +2791,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->extable), &mod->num_exentries);
if (section_addr(info, "__obsparm"))
- printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
- mod->name);
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
info->debug = section_objs(info, "__verbose",
sizeof(*info->debug), &info->num_debug);
@@ -3078,11 +3067,10 @@ static int do_init_module(struct module *mod)
return ret;
}
if (ret > 0) {
- printk(KERN_WARNING
-"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
-"%s: loading module anyway...\n",
- __func__, mod->name, ret,
- __func__);
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
dump_stack();
}
@@ -3205,10 +3193,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
/* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
- if (ret != 0) {
- printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
- modname, param);
- }
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
return 0;
}
@@ -3243,10 +3229,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
#ifdef CONFIG_MODULE_SIG
mod->sig_ok = info->sig_ok;
if (!mod->sig_ok) {
- printk_once(KERN_NOTICE
- "%s: module verification failed: signature and/or"
- " required key missing - tainting kernel\n",
- mod->name);
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
}
#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index b6c482c..c00b4ce 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -233,7 +233,7 @@ static const struct tnt tnts[] = {
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
+ static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
if (tainted_mask) {
char *s;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4208655..06c62de 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -132,6 +132,12 @@ out:
return ERR_PTR(err);
}
+static void delayed_free_pidns(struct rcu_head *p)
+{
+ kmem_cache_free(pid_ns_cachep,
+ container_of(p, struct pid_namespace, rcu));
+}
+
static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
@@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
- kmem_cache_free(pid_ns_cachep, ns);
+ call_rcu(&ns->rcu, delayed_free_pidns);
}
struct pid_namespace *copy_pid_ns(unsigned long flags,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d444c4e..2fac9cc 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG
def_bool y
depends on PM_DEBUG && PM_SLEEP
+config DPM_WATCHDOG
+ bool "Device suspend/resume watchdog"
+ depends on PM_DEBUG && PSTORE
+ ---help---
+ Sets up a watchdog timer to capture drivers that are
+ locked up attempting to suspend/resume a device.
+ A detected lockup causes system panic with message
+ captured in pstore device for inspection in subsequent
+ boot session.
+
+config DPM_WATCHDOG_TIMEOUT
+ int "Watchdog timeout in seconds"
+ range 1 120
+ default 12
+ depends on DPM_WATCHDOG
+
config PM_TRACE
bool
help
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index a394297..8dff9b4 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else if (count <= 11) { /* ASCII perhaps? */
- char ascii_value[11];
- unsigned long int ulval;
+ } else {
int ret;
- if (copy_from_user(ascii_value, buf, count))
- return -EFAULT;
-
- if (count > 10) {
- if (ascii_value[10] == '\n')
- ascii_value[10] = '\0';
- else
- return -EINVAL;
- } else {
- ascii_value[count] = '\0';
- }
- ret = kstrtoul(ascii_value, 16, &ulval);
- if (ret) {
- pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
- return -EINVAL;
- }
- value = (s32)lower_32_bits(ulval);
- } else {
- return -EINVAL;
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
}
req = filp->private_data;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 98c3b34..10c22ca 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1402,7 +1402,11 @@ int hibernate_preallocate_memory(void)
* highmem and non-highmem zones separately.
*/
pages_highmem = preallocate_image_highmem(highmem / 2);
- alloc = (count - max_size) - pages_highmem;
+ alloc = count - max_size;
+ if (alloc > pages_highmem)
+ alloc -= pages_highmem;
+ else
+ alloc = 0;
pages = preallocate_image_memory(alloc, avail_normal);
if (pages < alloc) {
/* We have exhausted non-highmem pages, try highmem. */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 957f061..2485027 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -36,9 +36,9 @@ static struct snapshot_data {
struct snapshot_handle handle;
int swap;
int mode;
- char frozen;
- char ready;
- char platform_support;
+ bool frozen;
+ bool ready;
+ bool platform_support;
bool free_bitmaps;
} snapshot_state;
@@ -93,9 +93,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
atomic_inc(&snapshot_device_available);
- data->frozen = 0;
- data->ready = 0;
- data->platform_support = 0;
+ data->frozen = false;
+ data->ready = false;
+ data->platform_support = false;
Unlock:
unlock_system_sleep();
@@ -229,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (error)
thaw_processes();
else
- data->frozen = 1;
+ data->frozen = true;
break;
@@ -240,7 +240,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
free_basic_memory_bitmaps();
data->free_bitmaps = false;
thaw_processes();
- data->frozen = 0;
+ data->frozen = false;
break;
case SNAPSHOT_CREATE_IMAGE:
@@ -270,7 +270,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
case SNAPSHOT_FREE:
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
- data->ready = 0;
+ data->ready = false;
/*
* It is necessary to thaw kernel threads here, because
* SNAPSHOT_CREATE_IMAGE may be invoked directly after
@@ -334,7 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
* PM_HIBERNATION_PREPARE
*/
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- data->ready = 0;
+ data->ready = false;
break;
case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b4e8500..be7c86b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {
#ifdef CONFIG_KEXEC
/*
- * This appends the listed symbols to /proc/vmcoreinfo
+ * This appends the listed symbols to /proc/vmcore
*
- * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
+ * /proc/vmcore is used by various utilities, like crash and makedumpfile to
* obtain access to symbols that are otherwise very difficult to locate. These
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
ignore_loglevel = 1;
- printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+ pr_info("debug: ignoring loglevel setting.\n");
return 0;
}
@@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)
pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, loops_per_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
- return 1;
+ return 0;
}
-__setup("boot_delay=", boot_delay_setup);
+early_param("boot_delay", boot_delay_setup);
static void boot_delay_msec(int level)
{
@@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
{
keep_bootcon = 1;
- printk(KERN_INFO "debug: skip boot console de-registration.\n");
+ pr_info("debug: skip boot console de-registration.\n");
return 0;
}
@@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)
/* find the last or real console */
for_each_console(bcon) {
if (!(bcon->flags & CON_BOOT)) {
- printk(KERN_INFO "Too late to register bootconsole %s%d\n",
+ pr_info("Too late to register bootconsole %s%d\n",
newcon->name, newcon->index);
return;
}
@@ -2358,21 +2358,18 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
+ pr_info("%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
if (bcon &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* we need to iterate through twice, to make sure we print
- * everything out, before we unregister the console(s)
+ /* We need to iterate through all boot consoles, to make
+ * sure we print everything out, before we unregister them.
*/
- printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
- newcon->name, newcon->index);
for_each_console(bcon)
if (bcon->flags & CON_BOOT)
unregister_console(bcon);
- } else {
- printk(KERN_INFO "%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
}
}
EXPORT_SYMBOL(register_console);
@@ -2382,6 +2379,10 @@ int unregister_console(struct console *console)
struct console *a, *b;
int res;
+ pr_info("%sconsole [%s%d] disabled\n",
+ (console->flags & CON_BOOT) ? "boot" : "" ,
+ console->name, console->index);
+
res = _braille_unregister_console(console);
if (res)
return res;
@@ -2421,8 +2422,6 @@ static int __init printk_late_init(void)
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- printk(KERN_INFO "turn off boot console %s%d\n",
- con->name, con->index);
unregister_console(con);
}
}
@@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
if (pending & PRINTK_PENDING_SCHED) {
char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
+ pr_warn("[sched_delayed] %s", buf);
}
if (pending & PRINTK_PENDING_WAKEUP)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dd562e9..1f4bcb3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
if (task->mm)
dumpable = get_dumpable(task->mm);
rcu_read_lock();
- if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ if (dumpable != SUID_DUMP_USER &&
+ !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
rcu_read_unlock();
return -EPERM;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aa066f3..1deccd7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4883,6 +4883,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
@@ -4894,6 +4896,7 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
}
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
@@ -4902,6 +4905,9 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41c02b6..df77c60 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6534,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- for (; sd; sd = sd->parent)
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6551,16 +6551,16 @@ unlock:
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
+ int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- for (; sd; sd = sd->parent)
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6767,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
struct sched_domain *sd;
+ struct sched_group_power *sgp;
+ int nr_busy;
if (unlikely(idle_cpu(cpu)))
return 0;
@@ -6792,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
goto need_kick;
rcu_read_lock();
- for_each_domain(cpu, sd) {
- struct sched_group *sg = sd->groups;
- struct sched_group_power *sgp = sg->sgp;
- int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
- goto need_kick_unlock;
+ if (sd) {
+ sgp = sd->groups->sgp;
+ nr_busy = atomic_read(&sgp->nr_busy_cpus);
- if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
- && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
+ if (nr_busy > 1)
goto need_kick_unlock;
-
- if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
- break;
}
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
rcu_read_unlock();
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e650ac..88c85b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -623,6 +623,8 @@ DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_power {
atomic_t ref;
diff --git a/kernel/signal.c b/kernel/signal.c
index ded28b9..940b30e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
-int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
int err;
diff --git a/kernel/smp.c b/kernel/smp.c
index 0564571..4611610 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
enum {
CSD_FLAG_LOCK = 0x01,
+ CSD_FLAG_WAIT = 0x02,
};
struct call_function_data {
@@ -124,7 +125,7 @@ static void csd_lock(struct call_single_data *csd)
static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
@@ -146,6 +147,9 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
unsigned long flags;
int ipi;
+ if (wait)
+ csd->flags |= CSD_FLAG_WAIT;
+
raw_spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
list_add_tail(&csd->list, &dst->list);
@@ -340,6 +344,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
}
put_cpu();
}
+EXPORT_SYMBOL_GPL(__smp_call_function_single);
/**
* smp_call_function_many(): Run a function on a set of other CPUs.
@@ -524,6 +529,11 @@ void __init setup_nr_cpu_ids(void)
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
+void __weak smp_announce(void)
+{
+ printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
+}
+
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
@@ -540,7 +550,7 @@ void __init smp_init(void)
}
/* Any cleanup work */
- printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+ smp_announce();
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index dcab1d3..b249883 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -29,7 +29,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
-#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
@@ -134,7 +133,6 @@ EXPORT_SYMBOL(local_bh_disable);
static void __local_bh_enable(unsigned int cnt)
{
- WARN_ON_ONCE(in_irq());
WARN_ON_ONCE(!irqs_disabled());
if (softirq_count() == cnt)
@@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
+ WARN_ON_ONCE(in_irq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
@@ -171,8 +170,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
*/
preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
- if (unlikely(!in_interrupt() && local_softirq_pending()))
+ if (unlikely(!in_interrupt() && local_softirq_pending())) {
+ /*
+ * Run softirq if any pending. And do it in its own stack
+ * as we may be calling this deep in a task call stack already.
+ */
do_softirq();
+ }
preempt_count_dec();
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -280,10 +284,11 @@ restart:
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-#ifndef __ARCH_HAS_DO_SOFTIRQ
+
asmlinkage void do_softirq(void)
{
@@ -298,13 +303,11 @@ asmlinkage void do_softirq(void)
pending = local_softirq_pending();
if (pending)
- __do_softirq();
+ do_softirq_own_stack();
local_irq_restore(flags);
}
-#endif
-
/*
* Enter an interrupt context.
*/
@@ -329,15 +332,21 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
* it is the irq stack, because it should be near empty
- * at this stage. But we have no way to know if the arch
- * calls irq_exit() on the irq stack. So call softirq
- * in its own stack to prevent from any overrun on top
- * of a potentially deep task stack.
+ * at this stage.
*/
- do_softirq();
+ __do_softirq();
+#else
+ /*
+ * Otherwise, irq_exit() is called on the task stack that can
+ * be potentially deep already. So call softirq in its own stack
+ * to prevent from any overrun.
+ */
+ do_softirq_own_stack();
+#endif
} else {
wakeup_softirqd();
}
@@ -771,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable();
if (local_softirq_pending()) {
+ /*
+ * We can safely run softirq on inline stack, as we are not deep
+ * in the task stack here.
+ */
__do_softirq();
rcu_note_context_switch(cpu);
local_irq_enable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c530bc5..84571e0 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
+#include <linux/lglock.h>
/*
* Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
+/*
+ * Avoids a race between stop_two_cpus and global stop_cpus, where
+ * the stoppers could get queued up in reverse order, leading to
+ * system deadlock. Using an lglock means stop_two_cpus remains
+ * relatively cheap.
+ */
+DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -276,6 +285,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
+ lg_local_lock(&stop_cpus_lock);
/*
* Queuing needs to be done by the lowest numbered CPU, to ensure
* that works are always queued in the same order on every CPU.
@@ -284,6 +294,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
smp_call_function_single(min(cpu1, cpu2),
&irq_cpu_stop_queue_work,
&call_args, 0);
+ lg_local_unlock(&stop_cpus_lock);
preempt_enable();
wait_for_completion(&done.completion);
@@ -335,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- preempt_disable();
+ lg_global_lock(&stop_cpus_lock);
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
- preempt_enable();
+ lg_global_unlock(&stop_cpus_lock);
}
static int __stop_cpus(const struct cpumask *cpumask,
diff --git a/kernel/sys.c b/kernel/sys.c
index c18ecca..c723113 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,7 +16,6 @@
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
-#include <linux/kexec.h>
#include <linux/workqueue.h>
#include <linux/capability.h>
#include <linux/device.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 339c003..34a6047 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
-static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
+static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -1057,6 +1057,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = perf_proc_update_handler,
+ .extra1 = &one,
},
{
.procname = "perf_cpu_time_max_percent",
@@ -2222,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
*i = val;
} else {
val = convdiv * (*i) / convmul;
- if (!first)
+ if (!first) {
err = proc_put_char(&buffer, &left, '\t');
+ if (err)
+ break;
+ }
err = proc_put_long(&buffer, &left, val, false);
if (err)
break;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b609213..653cbbd 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += snprintf(str, end - str, "%lu\t", value);
+ str += scnprintf(str, end - str, "%lu\t", value);
}
result = kernel_write(file, buffer, str - buffer, 0);
@@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,
if (get_user(value, vec + i))
goto out_kfree;
- str += snprintf(str, end - str, "%lu\t", value);
+ str += scnprintf(str, end - str, "%lu\t", value);
}
result = kernel_write(file, buffer, str - buffer, 0);
@@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,
if (get_user(dnaddr, (__le16 __user *)newval))
goto out;
- len = snprintf(buf, sizeof(buf), "%hu.%hu",
+ len = scnprintf(buf, sizeof(buf), "%hu.%hu",
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d..9f4618e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
struct listener_list *listeners;
struct listener *s, *tmp, *s2;
unsigned int cpu;
+ int ret = 0;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
@@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
for_each_cpu(cpu, mask) {
s = kmalloc_node(sizeof(struct listener),
GFP_KERNEL, cpu_to_node(cpu));
- if (!s)
+ if (!s) {
+ ret = -ENOMEM;
goto cleanup;
-
+ }
s->pid = pid;
s->valid = 1;
@@ -339,7 +341,7 @@ cleanup:
}
up_write(&listeners->sem);
}
- return 0;
+ return ret;
}
static int parse(struct nlattr *na, struct cpumask *mask)
@@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
if (!na)
goto err;
- if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
+ nla_nest_cancel(skb, na);
goto err;
+ }
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
- if (!ret)
+ if (!ret) {
+ nla_nest_cancel(skb, na);
goto err;
+ }
nla_nest_end(skb, na);
return nla_data(ret);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe8..3ce6e8c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
# RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
- depends on 64BIT
+ depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
select RCU_USER_QS
select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fc..88c9c65 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
return hrtimer_get_res(baseid, tp);
}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
*tp = ktime_to_timespec(base->gettime());
return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 662c579..086ad60 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
const char *buf, size_t count)
{
char name[CS_NAME_LEN];
- size_t ret = sysfs_get_uname(buf, name, count);
+ ssize_t ret = sysfs_get_uname(buf, name, count);
struct clock_event_device *ce;
if (ret < 0)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736..ba3e502 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
+void clocksource_mark_unstable(struct clocksource *cs) { }
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
}
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
- *
+ * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
+ * @mult: cycle to nanosecond multiplier
+ * @shift: cycle to nanosecond divisor (power of two)
+ * @maxadj: maximum adjustment value to mult (~11%)
+ * @mask: bitmask for two's complement subtraction of non 64 bit counters
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
* cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+ * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
* which is equivalent to the below.
- * max_cycles < (2^63)/(cs->mult + cs->maxadj)
- * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
- * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
- * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
+ * max_cycles < (2^63)/(mult + maxadj)
+ * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
+ * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
+ * max_cycles < 2^(63 - log2(mult + maxadj))
+ * max_cycles < 1 << (63 - log2(mult + maxadj))
* Please note that we add 1 to the result of the log2 to account for
* any rounding errors, ensure the above inequality is satisfied and
* no overflow will occur.
*/
- max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
+ max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
/*
* The actual maximum number of cycles we can defer the clocksource is
- * determined by the minimum of max_cycles and cs->mask.
+ * determined by the minimum of max_cycles and mask.
* Note: Here we subtract the maxadj to make sure we don't sleep for
* too long if there's a large negative adjustment.
*/
- max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
- max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
- cs->shift);
+ max_cycles = min(max_cycles, mask);
+ max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+
+ return max_nsecs;
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs: Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+ u64 max_nsecs;
+ max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
+ cs->mask);
/*
* To ensure that the clocksource does not wrap whilst we are idle,
* limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
return count;
}
-size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
{
size_t ret = cnt;
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- size_t ret;
+ ssize_t ret;
mutex_lock(&clocksource_mutex);
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
{
struct clocksource *cs;
char name[CS_NAME_LEN];
- size_t ret;
+ ssize_t ret;
ret = sysfs_get_uname(buf, name, count);
if (ret < 0)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bb22151..af8d1d4 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
* called as close as possible to 500 ms before the new second starts.
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
+ * We want the clock to be within a couple of ticks from the target.
*/
if (!ntp_synced()) {
/*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
struct timespec adjust = now;
fail = -ENODEV;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6..68b7993 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/jiffies.h>
+#include <linux/ktime.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/syscore_ops.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
#include <linux/sched_clock.h>
+#include <linux/seqlock.h>
+#include <linux/bitops.h>
struct clock_data {
+ ktime_t wrap_kt;
u64 epoch_ns;
- u32 epoch_cyc;
- u32 epoch_cyc_copy;
+ u64 epoch_cyc;
+ seqcount_t seq;
unsigned long rate;
u32 mult;
u32 shift;
bool suspended;
};
-static void sched_clock_poll(unsigned long wrap_ticks);
-static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
+static struct hrtimer sched_clock_timer;
static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
.mult = NSEC_PER_SEC / HZ,
};
-static u32 __read_mostly sched_clock_mask = 0xffffffff;
+static u64 __read_mostly sched_clock_mask;
-static u32 notrace jiffy_sched_clock_read(void)
+static u64 notrace jiffy_sched_clock_read(void)
{
- return (u32)(jiffies - INITIAL_JIFFIES);
+ /*
+ * We don't need to use get_jiffies_64 on 32-bit arches here
+ * because we register with BITS_PER_LONG
+ */
+ return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static u32 __read_mostly (*read_sched_clock_32)(void);
+
+static u64 notrace read_sched_clock_32_wrapper(void)
+{
+ return read_sched_clock_32();
+}
+
+static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
return (cyc * mult) >> shift;
}
-static unsigned long long notrace sched_clock_32(void)
+unsigned long long notrace sched_clock(void)
{
u64 epoch_ns;
- u32 epoch_cyc;
- u32 cyc;
+ u64 epoch_cyc;
+ u64 cyc;
+ unsigned long seq;
if (cd.suspended)
return cd.epoch_ns;
- /*
- * Load the epoch_cyc and epoch_ns atomically. We do this by
- * ensuring that we always write epoch_cyc, epoch_ns and
- * epoch_cyc_copy in strict order, and read them in strict order.
- * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
- * the middle of an update, and we should repeat the load.
- */
do {
+ seq = read_seqcount_begin(&cd.seq);
epoch_cyc = cd.epoch_cyc;
- smp_rmb();
epoch_ns = cd.epoch_ns;
- smp_rmb();
- } while (epoch_cyc != cd.epoch_cyc_copy);
+ } while (read_seqcount_retry(&cd.seq, seq));
cyc = read_sched_clock();
cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
static void notrace update_sched_clock(void)
{
unsigned long flags;
- u32 cyc;
+ u64 cyc;
u64 ns;
cyc = read_sched_clock();
ns = cd.epoch_ns +
cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
cd.mult, cd.shift);
- /*
- * Write epoch_cyc and epoch_ns in a way that the update is
- * detectable in cyc_to_fixed_sched_clock().
- */
+
raw_local_irq_save(flags);
- cd.epoch_cyc_copy = cyc;
- smp_wmb();
+ write_seqcount_begin(&cd.seq);
cd.epoch_ns = ns;
- smp_wmb();
cd.epoch_cyc = cyc;
+ write_seqcount_end(&cd.seq);
raw_local_irq_restore(flags);
}
-static void sched_clock_poll(unsigned long wrap_ticks)
+static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
- mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
update_sched_clock();
+ hrtimer_forward_now(hrt, cd.wrap_kt);
+ return HRTIMER_RESTART;
}
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
+void __init sched_clock_register(u64 (*read)(void), int bits,
+ unsigned long rate)
{
- unsigned long r, w;
+ unsigned long r;
u64 res, wrap;
char r_unit;
if (cd.rate > rate)
return;
- BUG_ON(bits > 32);
WARN_ON(!irqs_disabled());
read_sched_clock = read;
- sched_clock_mask = (1ULL << bits) - 1;
+ sched_clock_mask = CLOCKSOURCE_MASK(bits);
cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */
- clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
+ clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
r = rate;
if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
r_unit = ' ';
/* calculate how many ns until we wrap */
- wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
- do_div(wrap, NSEC_PER_MSEC);
- w = wrap;
+ wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
+ cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
/* calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, cd.mult, cd.shift);
- pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
- bits, r, r_unit, res, w);
+ pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
+ bits, r, r_unit, res, wrap);
- /*
- * Start the timer to keep sched_clock() properly updated and
- * sets the initial epoch.
- */
- sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
update_sched_clock();
/*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pF as sched_clock source\n", read);
}
-unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
-
-unsigned long long notrace sched_clock(void)
+void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
{
- return sched_clock_func();
+ read_sched_clock_32 = read;
+ sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
}
void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
* make it the final one one.
*/
if (read_sched_clock == jiffy_sched_clock_read)
- setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
+ sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
- sched_clock_poll(sched_clock_timer.data);
+ update_sched_clock();
+
+ /*
+ * Start the timer to keep sched_clock() properly updated and
+ * sets the initial epoch.
+ */
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ sched_clock_timer.function = sched_clock_poll;
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
static int sched_clock_suspend(void)
{
- sched_clock_poll(sched_clock_timer.data);
+ sched_clock_poll(&sched_clock_timer);
cd.suspended = true;
return 0;
}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void)
{
cd.epoch_cyc = read_sched_clock();
- cd.epoch_cyc_copy = cd.epoch_cyc;
cd.suspended = false;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb5..9532690 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
struct clock_event_device *newdev)
{
if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
(newdev->features & CLOCK_EVT_FEAT_C3STOP))
return false;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906ca..18e71f7 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
extern void clockevents_shutdown(struct clock_event_device *dev);
-extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/*
* NO_HZ / high resolution timer shared code
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 947ba25..3abf534 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
* ktime_get_update_offsets - hrtimer helper
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
*
* Returns current monotonic time and updates the offsets
- * Called from hrtimer_interupt() or retrigger_next_event()
+ * Called from hrtimer_interrupt() or retrigger_next_event()
*/
ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
ktime_t *offs_tai)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f2..1fb08f2 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
period = ktime_to_timespec(time);
ms = period.tv_nsec / 1000000;
- seq_puts(m, "Timer Stats Version: v0.2\n");
+ seq_puts(m, "Timer Stats Version: v0.3\n");
seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n",
- atomic_read(&overflow_count));
+ seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
+ seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b8b8560..f785aef 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
+#include <linux/list.h>
#include <trace/events/block.h>
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
+static LIST_HEAD(running_trace_list);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -107,10 +111,18 @@ record_it:
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
-static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+static void trace_note_tsk(struct task_struct *tsk)
{
+ unsigned long flags;
+ struct blk_trace *bt;
+
tsk->btrace_seq = blktrace_seq;
- trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
+ spin_lock_irqsave(&running_trace_lock, flags);
+ list_for_each_entry(bt, &running_trace_list, running_list) {
+ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
+ sizeof(tsk->comm));
+ }
+ spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
goto record_it;
}
+ if (unlikely(tsk->btrace_seq != blktrace_seq))
+ trace_note_tsk(tsk);
+
/*
* A word about the locking here - we disable interrupts to reserve
* some space in the relay per-cpu buffer, to prevent an irq
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
-
- if (unlikely(tsk->btrace_seq != blktrace_seq))
- trace_note_tsk(bt, tsk);
-
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
+ INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.end_lba = cbuts.end_lba,
.pid = cbuts.pid,
};
- memcpy(&buts.name, &cbuts.name, 32);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
- if (copy_to_user(arg, &buts.name, 32)) {
+ if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
blk_trace_remove(q);
return -EFAULT;
}
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
+ spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
+ spin_lock_irq(&running_trace_lock);
+ list_del(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7974ba2..d9fea7d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
- (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+ (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10c86fb..73d08aa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
};
#define TRACE_BUF_SIZE 1024
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bc..78e27e3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
{
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cba..ed32284 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- need_resched =
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+
+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+ TRACE_FLAG_PREEMPT_RESCHED)) {
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'N';
+ break;
+ case TRACE_FLAG_NEED_RESCHED:
+ need_resched = 'n';
+ break;
+ case TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'p';
+ break;
+ default:
+ need_resched = '.';
+ break;
+ }
+
hardsoft_irq =
(hardirq && softirq) ? 'H' :
hardirq ? 'h' :