author    Jens Axboe <axboe@fb.com>  2014-04-15 20:02:24 (GMT)
committer Jens Axboe <axboe@fb.com>  2014-04-15 20:02:24 (GMT)
commit    f89e0dd9d1a72fdf6b8958bcadfa6abf84f3cae0 (patch)
tree      6d4ca8c67dc22d1c81053392078588f9ab3804b5 /kernel
parent    21f9fcd81593e201172160853b8647336fb81f4f (diff)
parent    c9eaa447e77efe77b7fa4c953bd62de8297fd6c5 (diff)
Merge tag 'v3.15-rc1' into for-3.16/core
We don't like this, but things have diverged with the blk-mq fixes in 3.15-rc1. So merge it in.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                       |  27
-rw-r--r--  kernel/audit.h                       |   6
-rw-r--r--  kernel/auditfilter.c                 |  33
-rw-r--r--  kernel/auditsc.c                     | 133
-rw-r--r--  kernel/cgroup.c                      |  32
-rw-r--r--  kernel/cpu.c                         |  38
-rw-r--r--  kernel/debug/debug_core.c            |  14
-rw-r--r--  kernel/exit.c                        | 110
-rw-r--r--  kernel/fork.c                        |  34
-rw-r--r--  kernel/futex.c                       |  37
-rw-r--r--  kernel/kallsyms.c                    |  11
-rw-r--r--  kernel/kexec.c                       |   5
-rw-r--r--  kernel/ksysfs.c                      |   5
-rw-r--r--  kernel/locking/Makefile              |   3
-rw-r--r--  kernel/module.c                      |  12
-rw-r--r--  kernel/panic.c                       |  15
-rw-r--r--  kernel/power/power.h                 |   3
-rw-r--r--  kernel/power/snapshot.c              |   3
-rw-r--r--  kernel/power/suspend.c               |   5
-rw-r--r--  kernel/power/swap.c                  |   2
-rw-r--r--  kernel/profile.c                     |  20
-rw-r--r--  kernel/relay.c                       |   4
-rw-r--r--  kernel/res_counter.c                 |  23
-rw-r--r--  kernel/sched/clock.c                 |   3
-rw-r--r--  kernel/sched/core.c                  |  49
-rw-r--r--  kernel/seccomp.c                     |   4
-rw-r--r--  kernel/signal.c                      |   4
-rw-r--r--  kernel/sys.c                         |  15
-rw-r--r--  kernel/sysctl.c                      |   6
-rw-r--r--  kernel/time/timekeeping.c            |   5
-rw-r--r--  kernel/trace/ring_buffer.c           |  19
-rw-r--r--  kernel/trace/trace.c                 |  10
-rw-r--r--  kernel/trace/trace.h                 |   3
-rw-r--r--  kernel/trace/trace_events.c          |  55
-rw-r--r--  kernel/trace/trace_events_trigger.c  |   2
-rw-r--r--  kernel/trace/trace_export.c          |   6
-rw-r--r--  kernel/trace/trace_kprobe.c          |  21
-rw-r--r--  kernel/trace/trace_output.c          |   2
-rw-r--r--  kernel/trace/trace_uprobe.c          |  20
-rw-r--r--  kernel/tracepoint.c                  | 521
40 files changed, 726 insertions(+), 594 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 95a20f3..7c28936 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- struct net *net;
+ struct net *net;
struct sk_buff *skb;
};
@@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb)
if (printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
- audit_log_lost("printk limit exceeded\n");
+ audit_log_lost("printk limit exceeded");
}
audit_hold_skb(skb);
@@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
if (audit_pid) {
pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared\n");
+ audit_log_lost("auditd disappeared");
audit_pid = 0;
audit_sock = NULL;
}
@@ -607,7 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
- /* Only support the initial namespaces for now. */
+ /* Only support initial user namespace for now. */
/*
* We return ECONNREFUSED because it tricks userspace into thinking
* that audit was not configured into the kernel. Lots of users
@@ -618,8 +618,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
* userspace will reject all logins. This should be removed when we
* support non init namespaces!!
*/
- if ((current_user_ns() != &init_user_ns) ||
- (task_active_pid_ns(current) != &init_pid_ns))
+ if (current_user_ns() != &init_user_ns)
return -ECONNREFUSED;
switch (msg_type) {
@@ -639,6 +638,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
if (!capable(CAP_AUDIT_CONTROL))
err = -EPERM;
break;
@@ -659,6 +663,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
@@ -668,7 +673,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return rc;
- audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid);
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
@@ -1097,7 +1102,7 @@ static void __net_exit audit_net_exit(struct net *net)
audit_sock = NULL;
}
- rcu_assign_pointer(aunet->nlsk, NULL);
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
synchronize_net();
netlink_kernel_release(sock);
}
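
The switch to RCU_INIT_POINTER() above is valid because the value being published is NULL: rcu_assign_pointer() exists to order the initialization of pointed-to data before the pointer becomes visible, and a NULL pointer has no such data to order against. A minimal sketch of the idiom (the example_* names are hypothetical, not from this patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example {
        int value;
};

static struct example __rcu *example_ptr;
static DEFINE_SPINLOCK(example_lock);

static void example_clear(void)
{
        struct example *old;

        spin_lock(&example_lock);
        old = rcu_dereference_protected(example_ptr,
                                        lockdep_is_held(&example_lock));
        /* publishing NULL needs no write barrier */
        RCU_INIT_POINTER(example_ptr, NULL);
        spin_unlock(&example_lock);

        synchronize_rcu();      /* let readers drain before freeing */
        kfree(old);
}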
@@ -1829,11 +1834,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
spin_unlock_irq(&tsk->sighand->siglock);
audit_log_format(ab,
- " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- sys_getppid(),
- tsk->pid,
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
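
This hunk fixes two things at once: the ppid format specifier now matches the int it prints, and sys_getppid() (a syscall that reports the parent pid as seen from the caller's pid namespace) gives way to task_ppid_nr(), which always reports the init-namespace value, so records stay unambiguous no matter which namespace triggered them. A short illustrative sketch of the distinction, not taken from this patch:

#include <linux/printk.h>
#include <linux/sched.h>

static void report_pids(struct task_struct *tsk)
{
        /* _nr: global (init pid namespace); _vnr: tsk's own namespace */
        pr_info("pid=%d ppid=%d (local pid=%d)\n",
                task_pid_nr(tsk), task_ppid_nr(tsk), task_pid_vnr(tsk));
}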
diff --git a/kernel/audit.h b/kernel/audit.h
index 8df13221..7bb6573 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -106,6 +106,11 @@ struct audit_names {
bool should_free;
};
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -202,6 +207,7 @@ struct audit_context {
} execve;
};
int fds[2];
+ struct audit_proctitle proctitle;
#if AUDIT_DEBUG
int put_count;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 92062fd..8e9bc9c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
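
The pr_fmt() define added above makes every subsequent pr_err()/pr_warn() in this file prefix its output with the module name, which is what lets the printk() conversions below drop their hand-written prefixes. A standalone sketch of the mechanism:

/* must precede <linux/printk.h>, which supplies a default otherwise */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void example_warn(void)
{
        /* logs as "audit: AUDIT_POSSIBLE is deprecated" in audit.o */
        pr_err("AUDIT_POSSIBLE is deprecated\n");
}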
@@ -226,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -249,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -403,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -431,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->val = 0;
}
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
err = audit_field_valid(entry, f);
if (err)
goto exit_free;
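
The new AUDIT_PID/AUDIT_PPID handling converts a pid supplied from the sender's pid namespace into a global pid before it is stored in the rule, so filters keep matching the values audit actually logs. The same pattern, extracted into a standalone sketch (hypothetical helper name):

#include <linux/errno.h>
#include <linux/pid.h>
#include <linux/rcupdate.h>

static int vpid_to_global(pid_t vnr, pid_t *global)
{
        struct pid *pid;

        rcu_read_lock();
        pid = find_vpid(vnr);           /* lookup in current's namespace */
        if (!pid) {
                rcu_read_unlock();
                return -ESRCH;          /* not visible from here */
        }
        *global = pid_nr(pid);          /* init-namespace number */
        rcu_read_unlock();
        return 0;
}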
@@ -479,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -709,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -1240,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(task_pid_vnr(current), f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
result = audit_uid_comparator(current_uid(), f->op, f->uid);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7aef2f4..f251a5e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,6 +42,8 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
@@ -68,6 +70,7 @@
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
@@ -79,6 +82,9 @@
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
@@ -451,15 +457,17 @@ static int audit_filter_rules(struct task_struct *tsk,
struct audit_field *f = &rule->fields[i];
struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
@@ -805,7 +813,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
long return_code)
{
@@ -842,6 +851,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
@@ -850,16 +866,15 @@ static inline void audit_free_names(struct audit_context *context)
if (context->put_count + context->ino_count != context->name_count) {
int i = 0;
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i++,
- n->name, n->name->name ?: "(null)");
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -955,6 +970,7 @@ static inline void audit_free_context(struct audit_context *context)
audit_free_aux(context);
kfree(context->filterkey);
kfree(context->sockaddr);
+ audit_proctitle_free(context);
kfree(context);
}
@@ -1157,7 +1173,7 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
@@ -1271,6 +1287,59 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
}
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
+{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
+ if (!ab)
+ return; /* audit_panic or being filtered */
+
+ audit_log_format(ab, "proctitle=");
+
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
+ }
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
int i, call_panic = 0;
@@ -1388,6 +1457,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_name(context, n, NULL, i++, &call_panic);
}
+ audit_log_proctitle(tsk, context);
+
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
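
The new AUDIT_PROCTITLE record emits up to MAX_PROCTITLE_AUDIT_LEN bytes of the command line; since get_cmdline() preserves the NUL separators between arguments, the buffer is right-trimmed of trailing non-printable bytes before audit_log_n_untrustedstring() encodes it. A userspace sketch of that trimming rule (test harness only, not kernel code):

#include <ctype.h>
#include <stdio.h>

static int proctitle_rtrim(const char *buf, int len)
{
        const char *end = buf + len - 1;

        while (end > buf && !isprint((unsigned char)*end))
                end--;
        len = end - buf + 1;
        /* covers a buffer that is a single non-printable byte */
        len -= isprint((unsigned char)buf[len - 1]) == 0;
        return len;
}

int main(void)
{
        char cmdline[] = "cat\0/etc/passwd\0";  /* argv[0] NUL argv[1] NUL */

        /* prints 15: the trailing NUL is dropped, the embedded one kept */
        printf("%d\n", proctitle_rtrim(cmdline, sizeof(cmdline) - 1));
        return 0;
}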
@@ -1406,7 +1477,7 @@ void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
+ context = audit_take_context(tsk, 0, 0);
if (!context)
return;
@@ -1500,7 +1571,7 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_get_context(tsk, success, return_code);
+ context = audit_take_context(tsk, success, return_code);
if (!context)
return;
@@ -1550,7 +1621,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1609,8 +1680,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -1682,7 +1752,7 @@ void __audit_getname(struct filename *name)
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
@@ -1721,15 +1791,15 @@ void audit_putname(struct filename *name)
BUG_ON(!context);
if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
struct audit_names *n;
int i = 0;
list_for_each_entry(n, &context->names_list, list)
- printk(KERN_ERR "name[%d] = %p = %s\n", i++,
- n->name, n->name->name ?: "(null)");
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
#endif
final_putname(name);
@@ -1738,9 +1808,8 @@ void audit_putname(struct filename *name)
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
context->in_syscall, name->name,
@@ -1981,12 +2050,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
- audit_log_format(ab, "pid=%d uid=%u"
- " old-auid=%u new-auid=%u old-ses=%u new-ses=%u"
- " res=%d",
- current->pid, uid,
- oldloginuid, loginuid, oldsessionid, sessionid,
- !rc);
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
audit_log_end(ab);
}
@@ -2208,7 +2275,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2233,7 +2300,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
+ audit_sig_pid = task_pid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2247,7 +2314,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2268,7 +2335,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2368,7 +2435,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
audit_log_untrustedstring(ab, current->comm);
if (mm) {
down_read(&mm->mmap_sem);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fede3d3..9fcdaa7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1487,6 +1487,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+ bool new_sb;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -1603,8 +1604,8 @@ out_unlock:
if (ret)
return ERR_PTR(ret);
- dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
- if (IS_ERR(dentry))
+ dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
return dentry;
}
@@ -2345,11 +2346,26 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
return ret;
}
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
+ int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
@@ -2357,7 +2373,13 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft), 0, cft->kf_ops, cft,
NULL, false, key);
- return PTR_ERR_OR_ZERO(kn);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ kernfs_remove(kn);
+ return ret;
}
/**
@@ -3752,6 +3774,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
*/
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+ err = cgroup_kn_set_ugid(kn);
+ if (err)
+ goto err_destroy;
+
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
if (err)
goto err_destroy;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index deff2e6..a9e710e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include <linux/lockdep.h>
#include "smpboot.h"
@@ -27,18 +28,23 @@
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
@@ -57,17 +63,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
};
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -87,6 +106,7 @@ void put_online_cpus(void)
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -117,6 +137,7 @@ void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -131,6 +152,7 @@ void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
/*
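
The dep_map plus the cpuhp_lock_* wrappers wire this hand-rolled refcounted lock into lockdep even though it is not a plain mutex, making reader/writer deadlocks involving CPU hotplug detectable. The general pattern, sketched under the assumption of CONFIG_DEBUG_LOCK_ALLOC=y (example_* names are hypothetical):

#include <linux/lockdep.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);
static struct lockdep_map example_dep_map =
        STATIC_LOCKDEP_MAP_INIT("example.lock", &example_dep_map);

static void example_get(void)           /* reader side */
{
        lock_map_acquire_read(&example_dep_map);
        mutex_lock(&example_mutex);
        /* ... bump a refcount ... */
        mutex_unlock(&example_mutex);
}

static void example_put(void)
{
        mutex_lock(&example_mutex);
        /* ... drop the refcount ... */
        mutex_unlock(&example_mutex);
        lock_map_release(&example_dep_map);
}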
@@ -166,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
int *nr_calls)
{
@@ -189,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -198,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
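
cpu_notifier_register_begin()/done() and the double-underscore register/unregister variants exist to close a race: a subsystem that first initializes per-cpu state for the currently online CPUs and then registers its callback could miss (or double-handle) a CPU that comes up in between. Holding the registration lock across both steps makes them atomic with respect to hotplug. A sketch of the intended usage (example_* names are hypothetical; profile.c and ring_buffer.c below are real adopters):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int example_cpu_callback(struct notifier_block *nb,
                                unsigned long action, void *hcpu)
{
        /* handle CPU_UP_PREPARE / CPU_ONLINE / CPU_DEAD ... */
        return NOTIFY_OK;
}

static struct notifier_block example_cpu_nb = {
        .notifier_call = example_cpu_callback,
};

static int example_init(void)
{
        int cpu;

        cpu_notifier_register_begin();

        for_each_online_cpu(cpu) {
                /* set up state for CPUs that are already up */
        }

        /* __register_cpu_notifier() expects the lock to be held */
        __register_cpu_notifier(&example_cpu_nb);

        cpu_notifier_register_done();
        return 0;
}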
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 99982a7..2956c8d 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
}
+
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
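
mm->mmap_cache (one shared "last used VMA" pointer per mm) is gone in 3.15-rc1, replaced by a small per-thread array, so the flush above must now walk all VMACACHE_SIZE slots. For reference, a lookup against that cache has the same shape; a simplified sketch (the real helper, vmacache_find(), additionally validates mm->vmacache_seqnum and vma->vm_mm):

#include <linux/mm.h>
#include <linux/sched.h>

static struct vm_area_struct *example_vmacache_probe(unsigned long addr)
{
        int i;

        for (i = 0; i < VMACACHE_SIZE; i++) {
                struct vm_area_struct *vma = current->vmacache[i];

                if (vma && vma->vm_start <= addr && addr < vma->vm_end)
                        return vma;
        }
        return NULL;    /* caller falls back to the mm rbtree */
}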
diff --git a/kernel/exit.c b/kernel/exit.c
index 6480d1c..6ed6a1d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -570,7 +570,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
@@ -784,9 +784,10 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
+ if (group_dead)
+ disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- check_stack_usage();
exit_thread();
/*
@@ -799,19 +800,15 @@ void do_exit(long code)
cgroup_exit(tsk);
- if (group_dead)
- disassociate_ctty(1);
-
module_put(task_thread_info(tsk)->exec_domain->module);
- proc_exit_connector(tsk);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
@@ -844,6 +841,7 @@ void do_exit(long code)
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -1038,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
* thread_group_leader() to filter out sub-threads.
@@ -1109,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we set its state to EXIT_DEAD/EXIT_TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1146,22 +1140,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a sub-thread, notify the parent.
- * If parent wants a zombie, don't release it now.
- */
- if (thread_group_leader(p) &&
- !do_notify_parent(p, p->exit_signal)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
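
The rewrite above collapses "xchg then repair" into a single cmpxchg that atomically claims the zombie: exactly one waiter observes EXIT_ZOMBIE and moves the task to EXIT_DEAD or the new EXIT_TRACE; every other contender sees a different value and backs off, with no BUG_ON needed. The claim idiom in isolation (illustrative names):

#include <linux/atomic.h>
#include <linux/types.h>

enum { STATE_READY, STATE_CLAIMED };

static int example_state = STATE_READY;

static bool example_try_claim(void)
{
        /* only the caller whose cmpxchg sees STATE_READY wins */
        return cmpxchg(&example_state, STATE_READY, STATE_CLAIMED)
                == STATE_READY;
}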
@@ -1338,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1356,33 +1352,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- /* dead body doesn't have much to contribute */
- if (unlikely(p->exit_state == EXIT_DEAD)) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * But do not ignore this task until the tracer does
- * wait_task_zombie()->do_notify_parent().
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+ if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
- /* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
- * A zombie ptracee is only visible to its ptracer.
- * Notification and reaping will be cascaded to the real
- * parent when the ptracer detaches.
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
*/
- if (likely(!ptrace) && unlikely(p->ptrace)) {
- /* it will become visible, clear notask_error */
- wo->notask_error = 0;
- return 0;
- }
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
- if (!delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
/*
* Allow access to stopped/continued state via zombie by
@@ -1408,19 +1415,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
wo->notask_error = 0;
} else {
/*
- * If @p is ptraced by a task in its real parent's group,
- * hide group stop/continued state when looking at @p as
- * the real parent; otherwise, a single stop can be
- * reported twice as group and ptrace stops.
- *
- * If a ptracer wants to distinguish the two events for its
- * own children, it should create a separate process which
- * takes the role of real parent.
- */
- if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
- return 0;
-
- /*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index abc4589..54a8d26 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -71,6 +73,7 @@
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -284,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -364,7 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->locked_vm = 0;
mm->mmap = NULL;
- mm->mmap_cache = NULL;
+ mm->vmacache_seqnum = 0;
mm->map_count = 0;
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
@@ -530,8 +533,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
@@ -540,8 +541,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm_init_owner(mm, p);
clear_tlb_flush_pending(mm);
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
+ }
+
+ if (likely(!mm_alloc_pgd(mm))) {
mmu_notifier_mm_init(mm);
return mm;
}
@@ -877,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -1070,15 +1081,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
-{
- unsigned long new_flags = p->flags;
-
- new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
- new_flags |= PF_FORKNOEXEC;
- p->flags = new_flags;
-}
-
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1228,7 +1230,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1274,7 +1277,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
}
- mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
diff --git a/kernel/futex.c b/kernel/futex.c
index 67dacaf..5f58927 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -70,7 +70,10 @@
#include "locking/rtmutex_common.h"
/*
- * Basic futex operation and ordering guarantees:
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
*
* The waiter reads the futex value in user space and calls
* futex_wait(). This function computes the hash bucket and acquires
@@ -119,7 +122,7 @@
* sys_futex(WAIT, futex, val);
* futex_wait(futex, val);
*
- * waiters++;
+ * waiters++; (a)
* mb(); (A) <-- paired with -.
* |
* lock(hash_bucket(futex)); |
@@ -135,14 +138,14 @@
* unlock(hash_bucket(futex));
* schedule(); if (waiters)
* lock(hash_bucket(futex));
- * wake_waiters(futex);
- * unlock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
*
- * Where (A) orders the waiters increment and the futex value read -- this
- * is guaranteed by the head counter in the hb spinlock; and where (B)
- * orders the write to futex and the waiters read -- this is done by the
- * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
- * depending on the futex type.
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -155,6 +158,17 @@
* Which guarantees that x==0 && y==0 is impossible; which translates back into
* the guarantee that we cannot both miss the futex variable change and the
* enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
*/
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
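
The hb_waiters_inc()/hb_waiters_dec() calls added to the requeue paths below keep the destination bucket's waiter count nonzero for the whole window in which waiters may be moving into it, so a concurrent futex_wake() on the second uaddr cannot conclude the bucket is empty and skip taking its lock. The shape of the bracketing, heavily simplified (the real code orders the two bucket locks by address in double_lock_hb()):

#include <linux/atomic.h>
#include <linux/spinlock.h>

struct example_bucket {
        atomic_t   waiters;
        spinlock_t lock;
};

static void example_requeue(struct example_bucket *src,
                            struct example_bucket *dst)
{
        atomic_inc(&dst->waiters);      /* visible before locking */
        spin_lock(&src->lock);
        spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);
        /* ... move queued waiters from src to dst ... */
        spin_unlock(&dst->lock);
        spin_unlock(&src->lock);
        atomic_dec(&dst->waiters);
}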
@@ -1452,6 +1466,7 @@ retry:
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1461,6 +1476,7 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -1510,6 +1526,7 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
@@ -1519,6 +1536,7 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
cond_resched();
@@ -1594,6 +1612,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3127ad5..cb0cf37 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/sections.h>
@@ -36,8 +37,8 @@
* These will be re-linked against their real values
* during the second link stage.
*/
-extern const unsigned long kallsyms_addresses[] __attribute__((weak));
-extern const u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const u8 kallsyms_names[] __weak;
/*
* Tell the compiler that the count isn't in the small data section if the arch
@@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak));
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[] __attribute__((weak));
-extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
-extern const unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __weak;
static inline int is_kernel_inittext(unsigned long addr)
{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0d261c..c8380ad 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...)
* provide an empty default implementation here -- architecture
* code may override this
*/
-void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+unsigned long __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
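
These conversions are mechanical: __weak (from <linux/compiler.h>) expands to __attribute__((weak)) and marks a generic default that the linker silently discards whenever an architecture defines a strong symbol of the same name. The pattern in miniature (hypothetical symbol):

#include <linux/compiler.h>
#include <linux/types.h>

/* generic fallback; an arch overrides it by defining its own
 * non-weak example_read_cycles() */
u64 __weak example_read_cycles(void)
{
        return 0;
}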
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e660964..2495a9b 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -18,6 +18,7 @@
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
+#include <linux/compiler.h>
#include <linux/rcupdate.h> /* rcu_expedited */
@@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited);
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
-extern const void __start_notes __attribute__((weak));
-extern const void __stop_notes __attribute__((weak));
+extern const void __start_notes __weak;
+extern const void __stop_notes __weak;
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct file *filp, struct kobject *kobj,
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 306a76b..b8bdcd4 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = -pg
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
diff --git a/kernel/module.c b/kernel/module.c
index 8dc7f5e..1186940 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod)
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- __this_cpu_write(mod->refptr->incs, 1);
+ raw_cpu_write(mod->refptr->incs, 1);
return 0;
}
@@ -1013,6 +1013,8 @@ static size_t module_flags_taint(struct module *mod, char *buf)
buf[l++] = 'F';
if (mod->taints & (1 << TAINT_CRAP))
buf[l++] = 'C';
+ if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
+ buf[l++] = 'E';
/*
* TAINT_FORCED_RMMOD: could be added.
* TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -3218,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
pr_notice_once("%s: module verification failed: signature "
"and/or required key missing - tainting "
"kernel\n", mod->name);
- add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
#endif
@@ -3813,12 +3815,12 @@ void print_modules(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- printk(" %s%s", mod->name, module_flags(mod, buf));
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf));
}
preempt_enable();
if (last_unloaded_module[0])
- printk(" [last unloaded: %s]", last_unloaded_module);
- printk("\n");
+ pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ pr_cont("\n");
}
#ifdef CONFIG_MODVERSIONS
diff --git a/kernel/panic.c b/kernel/panic.c
index cca8a91..d02fa9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -100,7 +100,7 @@ void panic(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
@@ -141,7 +141,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -165,7 +165,7 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
@@ -176,6 +176,7 @@ void panic(const char *fmt, ...)
disabled_wait(caller);
}
#endif
+ pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -210,6 +211,7 @@ static const struct tnt tnts[] = {
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
+ { TAINT_UNSIGNED_MODULE, 'E', ' ' },
};
/**
@@ -228,6 +230,7 @@ static const struct tnt tnts[] = {
* 'C' - modules from drivers/staging are loaded.
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
+ * 'E' - Unsigned module has been loaded.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -274,8 +277,7 @@ unsigned long get_taint(void)
void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
- printk(KERN_WARNING
- "Disabling lock debugging due to kernel taint\n");
+ pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -380,8 +382,7 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
- printk(KERN_WARNING "---[ end trace %016llx ]---\n",
- (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
/*
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1ca7531..15f37ea 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -2,6 +2,7 @@
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
+#include <linux/compiler.h>
struct swsusp_info {
struct new_utsname uts;
@@ -11,7 +12,7 @@ struct swsusp_info {
unsigned long image_pages;
unsigned long pages;
unsigned long size;
-} __attribute__((aligned(PAGE_SIZE)));
+} __aligned(PAGE_SIZE);
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 149e745..18fb7a2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
struct linked_page {
struct linked_page *next;
char data[LINKED_PAGE_DATA_SIZE];
-} __attribute__((packed));
+} __packed;
static inline void
free_list_of_pages(struct linked_page *list, int clear_page_nosave)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 90b3d93..c3ad9ca 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,6 +26,7 @@
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
+#include <linux/compiler.h>
#include "power.h"
@@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state)
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+void __weak arch_suspend_disable_irqs(void)
{
local_irq_disable();
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+void __weak arch_suspend_enable_irqs(void)
{
local_irq_enable();
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c33ed2..8c9a481 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -101,7 +101,7 @@ struct swsusp_header {
unsigned int flags; /* Flags to pass to the "boot" kernel */
char orig_sig[10];
char sig[10];
-} __attribute__((packed));
+} __packed;
static struct swsusp_header *swsusp_header;
diff --git a/kernel/profile.c b/kernel/profile.c
index 1b266db..cb980f0 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -591,18 +591,28 @@ out_cleanup:
int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
struct proc_dir_entry *entry;
+ int err = 0;
if (!prof_on)
return 0;
- if (create_hash_tables())
- return -ENOMEM;
+
+ cpu_notifier_register_begin();
+
+ if (create_hash_tables()) {
+ err = -ENOMEM;
+ goto out;
+ }
+
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- return 0;
+ goto out;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- hotcpu_notifier(profile_cpu_callback, 0);
- return 0;
+ __hotcpu_notifier(profile_cpu_callback, 0);
+
+out:
+ cpu_notifier_register_done();
+ return err;
}
subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */
diff --git a/kernel/relay.c b/kernel/relay.c
index 52d6a6f..5a56d3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations relay_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -1253,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
+ nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 4aa8a30..51dbac6 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
counter->parent = parent;
}
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
- bool force)
+static u64 res_counter_uncharge_locked(struct res_counter *counter,
+ unsigned long val)
+{
+ if (WARN_ON(counter->usage < val))
+ val = counter->usage;
+
+ counter->usage -= val;
+ return counter->usage;
+}
+
+static int res_counter_charge_locked(struct res_counter *counter,
+ unsigned long val, bool force)
{
int ret = 0;
@@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
return __res_counter_charge(counter, val, limit_fail_at, true);
}
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
u64 res_counter_uncharge_until(struct res_counter *counter,
struct res_counter *top,
unsigned long val)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b30a292..3ef6451 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
#include <linux/sched.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
+#include <linux/compiler.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d1b87b..268a45e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
+#include <linux/compiler.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2845,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
- unsigned long flags;
- wait_queue_t wait;
-
- init_waitqueue_entry(&wait, current);
-
- __set_current_state(state);
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, &wait);
- spin_unlock(&q->lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
#ifdef CONFIG_RT_MUTEXES
/*
@@ -6498,7 +6453,7 @@ static cpumask_var_t fallback_doms;
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
{
return 0;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index fd609bd..d8d046c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -71,7 +71,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
struct pt_regs *regs = task_pt_regs(task);
sd->nr = syscall_get_nr(task, regs);
- sd->arch = syscall_get_arch(task, regs);
+ sd->arch = syscall_get_arch();
/* Unroll syscall_get_args to help gcc on arm. */
syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);
@@ -348,7 +348,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
info.si_code = SYS_SECCOMP;
info.si_call_addr = (void __user *)KSTK_EIP(current);
info.si_errno = reason;
- info.si_arch = syscall_get_arch(current, task_pt_regs(current));
+ info.si_arch = syscall_get_arch();
info.si_syscall = syscall;
force_sig_info(SIGSYS, &info, current);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 5d4b05a..6ea13c0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,6 +33,8 @@
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
+#include <linux/compiler.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
}
#endif
-__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index adaeab6..fba0f29 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return current->no_new_privs ? 1 : 0;
+ case PR_GET_THP_DISABLE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ break;
+ case PR_SET_THP_DISABLE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ down_write(&me->mm->mmap_sem);
+ if (arg2)
+ me->mm->def_flags |= VM_NOHUGEPAGE;
+ else
+ me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ up_write(&me->mm->mmap_sem);
+ break;
default:
error = -EINVAL;
break;
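
PR_SET_THP_DISABLE works by setting VM_NOHUGEPAGE in mm->def_flags, which (per the VM_INIT_DEF_MASK change in kernel/fork.c above) is inherited across fork, giving a per-process THP opt-out that survives in children. A userspace sketch; it needs a 3.15+ kernel, and the fallback defines assume the then-current uapi constant values:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLE
#define PR_SET_THP_DISABLE 41   /* values from the 3.15 uapi headers */
#define PR_GET_THP_DISABLE 42
#endif

int main(void)
{
        if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0))
                perror("PR_SET_THP_DISABLE");

        /* returns 0 or 1, or -1 with EINVAL on older kernels */
        printf("THP disabled: %d\n", prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
        return 0;
}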
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5c14b54..74f5b58 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
+/* this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+#ifdef CONFIG_DETECT_HUNG_TASK
+static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
+#endif
+
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
@@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
},
{
.procname = "hung_task_warnings",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b40279..f7df8ea 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
+#include <linux/compiler.h>
#include "tick-internal.h"
#include "ntp_internal.h"
@@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
@@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fc4da2d..c634868 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
* In that off case, we need to allocate for all possible cpus.
*/
#ifdef CONFIG_HOTPLUG_CPU
- get_online_cpus();
+ cpu_notifier_register_begin();
cpumask_copy(buffer->cpumask, cpu_online_mask);
#else
cpumask_copy(buffer->cpumask, cpu_possible_mask);
@@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
#ifdef CONFIG_HOTPLUG_CPU
buffer->cpu_notify.notifier_call = rb_cpu_notify;
buffer->cpu_notify.priority = 0;
- register_cpu_notifier(&buffer->cpu_notify);
+ __register_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_done();
#endif
- put_online_cpus();
mutex_init(&buffer->mutex);
return buffer;
@@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
fail_free_buffer:
kfree(buffer);
@@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;
- get_online_cpus();
-
#ifdef CONFIG_HOTPLUG_CPU
- unregister_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&buffer->cpu_notify);
#endif
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
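The ring-buffer hunks above replace get_online_cpus()/put_online_cpus() plus (un)register_cpu_notifier() with the 3.15 cpu_notifier_register_begin/done() protocol, which closes the race between registering a notifier and a concurrent CPU hotplug operation. A hedged sketch of the pattern, with hypothetical my_* names:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_notify(struct notifier_block *nb, unsigned long action,
			 void *hcpu)
{
	/* react to CPU_UP_PREPARE, CPU_DEAD, ... as needed */
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call	= my_cpu_notify,
	.priority	= 0,
};

static void my_setup(void)
{
	cpu_notifier_register_begin();
	/* ... initialize state for each currently online CPU ... */
	__register_cpu_notifier(&my_cpu_nb);	/* registration lock held */
	cpu_notifier_register_done();
}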
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9be67c5..737b0ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3611,6 +3611,8 @@ static const char readme_msg[] =
#ifdef CONFIG_TRACER_SNAPSHOT
"\t\t snapshot\n"
#endif
+ "\t\t dump\n"
+ "\t\t cpudump\n"
"\t example: echo do_fault:traceoff > set_ftrace_filter\n"
"\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
"\t The first one will disable tracing every time do_fault is hit\n"
@@ -4390,8 +4392,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
static const struct pipe_buf_operations tracing_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -4486,7 +4486,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
spd.pages[i] = alloc_page(GFP_KERNEL);
if (!spd.pages[i])
break;
@@ -5279,8 +5279,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -5356,7 +5354,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
- for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ffc314b..2e29d7b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
+#include <linux/compiler.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -1279,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
extern struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_##call;
+ __aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 83a4378..3ddfd8f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -223,24 +223,25 @@ int ftrace_event_reg(struct ftrace_event_call *call,
{
struct ftrace_event_file *file = data;
+ WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
case TRACE_REG_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->probe,
file);
case TRACE_REG_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->probe,
file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
case TRACE_REG_PERF_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
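Every call->name to ftrace_event_name(call) conversion below leans on the accessor that accompanies this series in <linux/ftrace_event.h>; a reconstructed sketch of its logic (treat the exact field names as an assumption):

static inline const char *ftrace_event_name(struct ftrace_event_call *call)
{
	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
		return call->tp ? call->tp->name : NULL;
	else
		return call->name;
}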
@@ -352,7 +353,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
- "%s\n", call->name);
+ "%s\n", ftrace_event_name(call));
break;
}
set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
@@ -481,27 +482,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
{
struct ftrace_event_file *file;
struct ftrace_event_call *call;
+ const char *name;
int ret = -EINVAL;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+ name = ftrace_event_name(call);
- if (!call->name || !call->class || !call->class->reg)
+ if (!name || !call->class || !call->class->reg)
continue;
if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
continue;
if (match &&
- strcmp(match, call->name) != 0 &&
+ strcmp(match, name) != 0 &&
strcmp(match, call->class->system) != 0)
continue;
if (sub && strcmp(sub, call->class->system) != 0)
continue;
- if (event && strcmp(event, call->name) != 0)
+ if (event && strcmp(event, name) != 0)
continue;
ftrace_event_enable_disable(file, set);
@@ -699,7 +702,7 @@ static int t_show(struct seq_file *m, void *v)
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
- seq_printf(m, "%s\n", call->name);
+ seq_printf(m, "%s\n", ftrace_event_name(call));
return 0;
}
@@ -792,7 +795,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!call->name || !call->class || !call->class->reg)
+ if (!ftrace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -907,7 +910,7 @@ static int f_show(struct seq_file *m, void *v)
switch ((unsigned long)v) {
case FORMAT_HEADER:
- seq_printf(m, "name: %s\n", call->name);
+ seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_printf(m, "format:\n");
return 0;
@@ -1527,6 +1530,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
struct trace_array *tr = file->tr;
struct list_head *head;
struct dentry *d_events;
+ const char *name;
int ret;
/*
@@ -1540,10 +1544,11 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
} else
d_events = parent;
- file->dir = debugfs_create_dir(call->name, d_events);
+ name = ftrace_event_name(call);
+ file->dir = debugfs_create_dir(name, d_events);
if (!file->dir) {
pr_warning("Could not create debugfs '%s' directory\n",
- call->name);
+ name);
return -1;
}
@@ -1567,7 +1572,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
ret = call->class->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
+ " events/%s\n", name);
return -1;
}
}
@@ -1631,15 +1636,17 @@ static void event_remove(struct ftrace_event_call *call)
static int event_init(struct ftrace_event_call *call)
{
int ret = 0;
+ const char *name;
- if (WARN_ON(!call->name))
+ name = ftrace_event_name(call);
+ if (WARN_ON(!name))
return -EINVAL;
if (call->class->raw_init) {
ret = call->class->raw_init(call);
if (ret < 0 && ret != -ENOSYS)
pr_warn("Could not initialize trace events/%s\n",
- call->name);
+ name);
}
return ret;
@@ -1885,7 +1892,7 @@ __trace_add_event_dirs(struct trace_array *tr)
ret = __trace_add_new_event(call, tr);
if (ret < 0)
pr_warning("Could not create directory for event %s\n",
- call->name);
+ ftrace_event_name(call));
}
}
@@ -1894,18 +1901,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
{
struct ftrace_event_file *file;
struct ftrace_event_call *call;
+ const char *name;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+ name = ftrace_event_name(call);
- if (!call->name || !call->class || !call->class->reg)
+ if (!name || !call->class || !call->class->reg)
continue;
if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
continue;
- if (strcmp(event, call->name) == 0 &&
+ if (strcmp(event, name) == 0 &&
strcmp(system, call->class->system) == 0)
return file;
}
@@ -1973,7 +1982,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%s:%s:%s",
data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
data->file->event_call->class->system,
- data->file->event_call->name);
+ ftrace_event_name(data->file->event_call));
if (data->count == -1)
seq_printf(m, ":unlimited\n");
@@ -2193,7 +2202,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
pr_warning("Could not create directory for event %s\n",
- file->event_call->name);
+ ftrace_event_name(file->event_call));
}
}
@@ -2217,7 +2226,7 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
pr_warning("Could not create early event %s\n",
- call->name);
+ ftrace_event_name(call));
}
}
@@ -2549,7 +2558,7 @@ static __init void event_trace_self_tests(void)
continue;
#endif
- pr_info("Testing event %s: ", call->name);
+ pr_info("Testing event %s: ", ftrace_event_name(call));
/*
* If an event is already enabled, someone is using
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8efbb69..925f537 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
seq_printf(m, "%s:%s:%s",
enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
enable_data->file->event_call->class->system,
- enable_data->file->event_call->name);
+ ftrace_event_name(enable_data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited");
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index ee0a509..d4ddde2 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -173,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \
}; \
\
struct ftrace_event_call __used event_##call = { \
- .name = #call, \
- .event.type = etype, \
.class = &event_class_ftrace_##call, \
+ { \
+ .name = #call, \
+ }, \
+ .event.type = etype, \
.print_fmt = print, \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
}; \
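The extra braces are needed because .name now sits in an anonymous union inside struct ftrace_event_call, shared with the struct tracepoint pointer. A standalone sketch of that initializer style, assuming C11/GNU anonymous unions:

#include <stdio.h>

struct call {
	void *class;
	union {
		const char *name;	/* legacy "ftrace" events */
		void *tp;		/* stand-in for struct tracepoint * */
	};
	int type;
};

int main(void)
{
	struct call c = {
		.class = NULL,
		{
			.name = "example",	/* fills the anonymous union */
		},
		.type = 1,
	};
	printf("%s\n", c.name);
	return 0;
}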
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d021d21..903ae28 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -341,7 +341,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
struct trace_kprobe *tk;
list_for_each_entry(tk, &probe_list, list)
- if (strcmp(tk->tp.call.name, event) == 0 &&
+ if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
strcmp(tk->tp.call.class->system, group) == 0)
return tk;
return NULL;
@@ -516,7 +516,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&probe_lock);
/* Delete old (same name) event if exist */
- old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system);
+ old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ tk->tp.call.class->system);
if (old_tk) {
ret = unregister_trace_kprobe(old_tk);
if (ret < 0)
@@ -564,7 +565,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (ret)
pr_warning("Failed to re-register probe %s on"
"%s: %d\n",
- tk->tp.call.name, mod->name, ret);
+ ftrace_event_name(&tk->tp.call),
+ mod->name, ret);
}
}
mutex_unlock(&probe_lock);
@@ -818,7 +820,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name);
+ seq_printf(m, ":%s/%s", tk->tp.call.class->system,
+ ftrace_event_name(&tk->tp.call));
if (!tk->symbol)
seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -876,7 +879,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
- seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit,
+ seq_printf(m, " %-44s %15lu %15lu\n",
+ ftrace_event_name(&tk->tp.call), tk->nhit,
tk->rp.kp.nmissed);
return 0;
@@ -1011,7 +1015,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1047,7 +1051,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1286,7 +1290,8 @@ static int register_kprobe_event(struct trace_kprobe *tk)
call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
- pr_info("Failed to register kprobe event: %s\n", call->name);
+ pr_info("Failed to register kprobe event: %s\n",
+ ftrace_event_name(call));
kfree(call->print_fmt);
unregister_ftrace_event(&call->event);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ca0e79e2..a436de1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -431,7 +431,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", event->name);
+ ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e447336..930e514 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -294,7 +294,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
struct trace_uprobe *tu;
list_for_each_entry(tu, &uprobe_list, list)
- if (strcmp(tu->tp.call.name, event) == 0 &&
+ if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
strcmp(tu->tp.call.class->system, group) == 0)
return tu;
@@ -324,7 +324,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&uprobe_lock);
/* register as an event */
- old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system);
+ old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ tu->tp.call.class->system);
if (old_tu) {
/* delete old event */
ret = unregister_trace_uprobe(old_tu);
@@ -599,7 +600,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name);
+ seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
+ ftrace_event_name(&tu->tp.call));
seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
for (i = 0; i < tu->tp.nr_args; i++)
@@ -649,7 +651,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_uprobe *tu = v;
- seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit);
+ seq_printf(m, " %s %-44s %15lu\n", tu->filename,
+ ftrace_event_name(&tu->tp.call), tu->nhit);
return 0;
}
@@ -844,12 +847,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name,
+ if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
entry->vaddr[1], entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name,
+ if (!trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, false);
@@ -1275,7 +1280,8 @@ static int register_uprobe_event(struct trace_uprobe *tu)
ret = trace_add_event_call(call);
if (ret) {
- pr_info("Failed to register uprobe event: %s\n", call->name);
+ pr_info("Failed to register uprobe event: %s\n",
+ ftrace_event_name(call));
kfree(call->print_fmt);
unregister_ftrace_event(&call->event);
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 50f8329..ac5b23c 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 Mathieu Desnoyers
+ * Copyright (C) 2008-2014 Mathieu Desnoyers
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,39 +33,27 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
+#ifdef CONFIG_MODULES
/*
- * Tracepoints mutex protects the builtin and module tracepoints and the hash
- * table, as well as the local module list.
+ * Tracepoint module list mutex protects the local module list.
*/
-static DEFINE_MUTEX(tracepoints_mutex);
+static DEFINE_MUTEX(tracepoint_module_list_mutex);
-#ifdef CONFIG_MODULES
-/* Local list of struct module */
+/* Local list of struct tp_module */
static LIST_HEAD(tracepoint_module_list);
#endif /* CONFIG_MODULES */
/*
- * Tracepoint hash table, containing the active tracepoints.
- * Protected by tracepoints_mutex.
+ * tracepoints_mutex protects the builtin and module tracepoints.
+ * tracepoints_mutex nests inside tracepoint_module_list_mutex.
*/
-#define TRACEPOINT_HASH_BITS 6
-#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+static DEFINE_MUTEX(tracepoints_mutex);
/*
* Note about RCU:
* It is used to delay freeing of the multiple-probes array until a quiescent
* state is reached.
- * Tracepoint entries modifications are protected by the tracepoints_mutex.
*/
-struct tracepoint_entry {
- struct hlist_node hlist;
- struct tracepoint_func *funcs;
- int refcount; /* Number of times armed. 0 if disarmed. */
- int enabled; /* Tracepoint enabled */
- char name[0];
-};
-
struct tp_probes {
struct rcu_head rcu;
struct tracepoint_func probes[0];
@@ -92,34 +80,33 @@ static inline void release_probes(struct tracepoint_func *old)
}
}
-static void debug_print_probes(struct tracepoint_entry *entry)
+static void debug_print_probes(struct tracepoint_func *funcs)
{
int i;
- if (!tracepoint_debug || !entry->funcs)
+ if (!tracepoint_debug || !funcs)
return;
- for (i = 0; entry->funcs[i].func; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
+ for (i = 0; funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
}
-static struct tracepoint_func *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0;
struct tracepoint_func *old, *new;
- if (WARN_ON(!probe))
+ if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
- debug_print_probes(entry);
- old = entry->funcs;
+ debug_print_probes(*funcs);
+ old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++)
- if (old[nr_probes].func == probe &&
- old[nr_probes].data == data)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -128,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
return ERR_PTR(-ENOMEM);
if (old)
memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- new[nr_probes].func = probe;
- new[nr_probes].data = data;
+ new[nr_probes] = *tp_func;
new[nr_probes + 1].func = NULL;
- entry->refcount = nr_probes + 1;
- entry->funcs = new;
- debug_print_probes(entry);
+ *funcs = new;
+ debug_print_probes(*funcs);
return old;
}
-static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static void *func_remove(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0, nr_del = 0, i;
struct tracepoint_func *old, *new;
- old = entry->funcs;
+ old = *funcs;
if (!old)
return ERR_PTR(-ENOENT);
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
/* (N -> M), (N > 1, M >= 0) probes */
- if (probe) {
+ if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == probe &&
- old[nr_probes].data == data)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
nr_del++;
}
}
@@ -165,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
*/
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
- entry->funcs = NULL;
- entry->refcount = 0;
- debug_print_probes(entry);
+ *funcs = NULL;
+ debug_print_probes(*funcs);
return old;
} else {
int j = 0;
@@ -177,91 +160,35 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
- if (old[i].func != probe || old[i].data != data)
+ if (old[i].func != tp_func->func
+ || old[i].data != tp_func->data)
new[j++] = old[i];
new[nr_probes - nr_del].func = NULL;
- entry->refcount = nr_probes - nr_del;
- entry->funcs = new;
+ *funcs = new;
}
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
return old;
}
/*
- * Get tracepoint if the tracepoint is present in the tracepoint hash table.
- * Must be called with tracepoints_mutex held.
- * Returns NULL if not present.
+ * Add the probe function to a tracepoint.
*/
-static struct tracepoint_entry *get_tracepoint(const char *name)
+static int tracepoint_add_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- struct hlist_head *head;
- struct tracepoint_entry *e;
- u32 hash = jhash(name, strlen(name), 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, head, hlist) {
- if (!strcmp(name, e->name))
- return e;
- }
- return NULL;
-}
+ struct tracepoint_func *old, *tp_funcs;
-/*
- * Add the tracepoint to the tracepoint hash table. Must be called with
- * tracepoints_mutex held.
- */
-static struct tracepoint_entry *add_tracepoint(const char *name)
-{
- struct hlist_head *head;
- struct tracepoint_entry *e;
- size_t name_len = strlen(name) + 1;
- u32 hash = jhash(name, name_len-1, 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, head, hlist) {
- if (!strcmp(name, e->name)) {
- printk(KERN_NOTICE
- "tracepoint %s busy\n", name);
- return ERR_PTR(-EEXIST); /* Already there */
- }
- }
- /*
- * Using kmalloc here to allocate a variable length element. Could
- * cause some memory fragmentation if overused.
- */
- e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
- memcpy(&e->name[0], name, name_len);
- e->funcs = NULL;
- e->refcount = 0;
- e->enabled = 0;
- hlist_add_head(&e->hlist, head);
- return e;
-}
+ if (tp->regfunc && !static_key_enabled(&tp->key))
+ tp->regfunc();
-/*
- * Remove the tracepoint from the tracepoint hash table. Must be called with
- * mutex_lock held.
- */
-static inline void remove_tracepoint(struct tracepoint_entry *e)
-{
- hlist_del(&e->hlist);
- kfree(e);
-}
-
-/*
- * Sets the probe callback corresponding to one tracepoint.
- */
-static void set_tracepoint(struct tracepoint_entry **entry,
- struct tracepoint *elem, int active)
-{
- WARN_ON(strcmp((*entry)->name, elem->name) != 0);
-
- if (elem->regfunc && !static_key_enabled(&elem->key) && active)
- elem->regfunc();
- else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
- elem->unregfunc();
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_add(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
+ }
+ release_probes(old);
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -270,197 +197,163 @@ static void set_tracepoint(struct tracepoint_entry **entry,
* include/linux/tracepoints.h. A matching smp_read_barrier_depends()
* is used.
*/
- rcu_assign_pointer(elem->funcs, (*entry)->funcs);
- if (active && !static_key_enabled(&elem->key))
- static_key_slow_inc(&elem->key);
- else if (!active && static_key_enabled(&elem->key))
- static_key_slow_dec(&elem->key);
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ if (!static_key_enabled(&tp->key))
+ static_key_slow_inc(&tp->key);
+ return 0;
}
/*
- * Disable a tracepoint and its probe callback.
+ * Remove a probe function from a tracepoint.
* Note: only waiting an RCU period after setting elem->call to the empty
* function ensures that the original callback is not used anymore. This is
* ensured by preempt_disable around the call site.
*/
-static void disable_tracepoint(struct tracepoint *elem)
+static int tracepoint_remove_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- if (elem->unregfunc && static_key_enabled(&elem->key))
- elem->unregfunc();
-
- if (static_key_enabled(&elem->key))
- static_key_slow_dec(&elem->key);
- rcu_assign_pointer(elem->funcs, NULL);
-}
+ struct tracepoint_func *old, *tp_funcs;
-/**
- * tracepoint_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of tracepoints.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probe_range(struct tracepoint * const *begin,
- struct tracepoint * const *end)
-{
- struct tracepoint * const *iter;
- struct tracepoint_entry *mark_entry;
-
- if (!begin)
- return;
-
- for (iter = begin; iter < end; iter++) {
- mark_entry = get_tracepoint((*iter)->name);
- if (mark_entry) {
- set_tracepoint(&mark_entry, *iter,
- !!mark_entry->refcount);
- mark_entry->enabled = !!mark_entry->refcount;
- } else {
- disable_tracepoint(*iter);
- }
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_remove(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
}
-}
-
-#ifdef CONFIG_MODULES
-void module_update_tracepoints(void)
-{
- struct tp_module *tp_mod;
-
- list_for_each_entry(tp_mod, &tracepoint_module_list, list)
- tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
- tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
-}
-#else /* CONFIG_MODULES */
-void module_update_tracepoints(void)
-{
-}
-#endif /* CONFIG_MODULES */
+ release_probes(old);
+ if (!tp_funcs) {
+ /* Removed last function */
+ if (tp->unregfunc && static_key_enabled(&tp->key))
+ tp->unregfunc();
-/*
- * Update probes, removing the faulty probes.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probes(void)
-{
- /* Core kernel tracepoints */
- tracepoint_update_probe_range(__start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- /* tracepoints in modules. */
- module_update_tracepoints();
-}
-
-static struct tracepoint_func *
-tracepoint_add_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry) {
- entry = add_tracepoint(name);
- if (IS_ERR(entry))
- return (struct tracepoint_func *)entry;
+ if (static_key_enabled(&tp->key))
+ static_key_slow_dec(&tp->key);
}
- old = tracepoint_entry_add_probe(entry, probe, data);
- if (IS_ERR(old) && !entry->refcount)
- remove_tracepoint(entry);
- return old;
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ return 0;
}
/**
* tracepoint_probe_register - Connect a probe to a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe handler
* @data: probe private data
- *
- * Returns:
- * - 0 if the probe was successfully registered, and tracepoint
- * callsites are currently loaded for that probe,
- * - -ENODEV if the probe was successfully registered, but no tracepoint
- * callsite is currently loaded for that probe,
- * - other negative error value on error.
- *
- * When tracepoint_probe_register() returns either 0 or -ENODEV,
- * parameters @name, @probe, and @data may be used by the tracepoint
- * infrastructure until the probe is unregistered.
*
- * The probe address must at least be aligned on the architecture pointer size.
+ * Returns 0 if ok, error value on error.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
*/
-int tracepoint_probe_register(const char *name, void *probe, void *data)
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
- struct tracepoint_entry *entry;
- int ret = 0;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
- entry = get_tracepoint(name);
- /* Make sure the entry was enabled */
- if (!entry || !entry->enabled)
- ret = -ENODEV;
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_add_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static struct tracepoint_func *
-tracepoint_remove_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry)
- return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe, data);
- if (IS_ERR(old))
- return old;
- if (!entry->refcount)
- remove_tracepoint(entry);
- return old;
-}
-
/**
* tracepoint_probe_unregister - Disconnect a probe from a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe function pointer
* @data: probe private data
*
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
+ * Returns 0 if ok, error value on error.
*/
-int tracepoint_probe_unregister(const char *name, void *probe, void *data)
+int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_remove_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
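A hedged usage sketch for the reworked pair above: callers now pass the tracepoint object itself (the __tracepoint_##name variable emitted by DEFINE_TRACE) instead of a name string. The my_event name and probe prototype below are hypothetical:

static void my_probe(void *data, int arg)	/* proto matches the tracepoint */
{
	/* runs each time the my_event tracepoint fires */
}

static int __init my_init(void)
{
	return tracepoint_probe_register(&__tracepoint_my_event,
					 (void *)my_probe, NULL);
}

static void __exit my_exit(void)
{
	/* must run before this module (or the tracepoint's) goes away */
	tracepoint_probe_unregister(&__tracepoint_my_event,
				    (void *)my_probe, NULL);
}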
-
#ifdef CONFIG_MODULES
bool trace_module_has_bad_taint(struct module *mod)
{
- return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
+ (1 << TAINT_UNSIGNED_MODULE));
+}
+
+static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
+
+/**
+ * register_tracepoint_module_notifier - register tracepoint coming/going notifier
+ * @nb: notifier block
+ *
+ * Notifiers registered with this function are called on module
+ * coming/going with the tracepoint_module_list_mutex held.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
+ */
+int register_tracepoint_module_notifier(struct notifier_block *nb)
+{
+ struct tp_module *tp_mod;
+ int ret;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
+
+/**
+ * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier
+ * @nb: notifier block
+ *
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
+ */
+int unregister_tracepoint_module_notifier(struct notifier_block *nb)
+{
+ struct tp_module *tp_mod;
+ int ret;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
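A hedged sketch of a notifier matching the kerneldoc above; the callback receives the struct tp_module, whose ->mod field identifies the module carrying the tracepoints (my_* names are hypothetical):

static int my_tp_module_notify(struct notifier_block *nb,
			       unsigned long val, void *data)
{
	struct tp_module *tp_mod = data;

	switch (val) {
	case MODULE_STATE_COMING:
		/* tracepoints in tp_mod->mod just became available */
		break;
	case MODULE_STATE_GOING:
		/* unregister any probes attached to tp_mod->mod here */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_tp_nb = {
	.notifier_call = my_tp_module_notify,
};

/* register_tracepoint_module_notifier(&my_tp_nb) from module init,
 * unregister_tracepoint_module_notifier(&my_tp_nb) from module exit. */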
+
+/*
+ * Ensure the tracer unregistered the module's probes before the module
+ * teardown is performed. Prevents leaks of probe and data pointers.
+ */
+static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ WARN_ON_ONCE((*iter)->funcs);
}
static int tracepoint_module_coming(struct module *mod)
@@ -474,40 +367,45 @@ static int tracepoint_module_coming(struct module *mod)
/*
* We skip modules that taint the kernel, especially those with different
* module headers (for forced load), to make sure we don't cause a crash.
- * Staging and out-of-tree GPL modules are fine.
+ * Staging, out-of-tree, and unsigned GPL modules are fine.
*/
if (trace_module_has_bad_taint(mod))
return 0;
- mutex_lock(&tracepoints_mutex);
+ mutex_lock(&tracepoint_module_list_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
if (!tp_mod) {
ret = -ENOMEM;
goto end;
}
- tp_mod->num_tracepoints = mod->num_tracepoints;
- tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
+ tp_mod->mod = mod;
list_add_tail(&tp_mod->list, &tracepoint_module_list);
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_COMING, tp_mod);
end:
- mutex_unlock(&tracepoints_mutex);
+ mutex_unlock(&tracepoint_module_list_mutex);
return ret;
}
-static int tracepoint_module_going(struct module *mod)
+static void tracepoint_module_going(struct module *mod)
{
- struct tp_module *pos;
+ struct tp_module *tp_mod;
if (!mod->num_tracepoints)
- return 0;
+ return;
- mutex_lock(&tracepoints_mutex);
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
- list_for_each_entry(pos, &tracepoint_module_list, list) {
- if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
- list_del(&pos->list);
- kfree(pos);
+ mutex_lock(&tracepoint_module_list_mutex);
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list) {
+ if (tp_mod->mod == mod) {
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_GOING, tp_mod);
+ list_del(&tp_mod->list);
+ kfree(tp_mod);
+ /*
+ * Call the going notifier before checking for
+ * quiescence.
+ */
+ tp_module_going_check_quiescent(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
break;
}
}
@@ -517,12 +415,11 @@ static int tracepoint_module_going(struct module *mod)
* flag on "going", in case a module taints the kernel only after being
* loaded.
*/
- mutex_unlock(&tracepoints_mutex);
- return 0;
+ mutex_unlock(&tracepoint_module_list_mutex);
}
-int tracepoint_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int tracepoint_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
int ret = 0;
@@ -534,24 +431,58 @@ int tracepoint_module_notify(struct notifier_block *self,
case MODULE_STATE_LIVE:
break;
case MODULE_STATE_GOING:
- ret = tracepoint_module_going(mod);
+ tracepoint_module_going(mod);
+ break;
+ case MODULE_STATE_UNFORMED:
break;
}
return ret;
}
-struct notifier_block tracepoint_module_nb = {
+static struct notifier_block tracepoint_module_nb = {
.notifier_call = tracepoint_module_notify,
.priority = 0,
};
-static int init_tracepoints(void)
+static __init int init_tracepoints(void)
{
- return register_module_notifier(&tracepoint_module_nb);
+ int ret;
+
+ ret = register_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ pr_warning("Failed to register tracepoint module enter notifier\n");
+
+ return ret;
}
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+}
+
+/**
+ * for_each_kernel_tracepoint - iteration on all kernel tracepoints
+ * @fct: callback
+ * @priv: private data
+ */
+void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ for_each_tracepoint_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs, fct, priv);
+}
+EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
+
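A hedged usage sketch for the new iterator: log the name of every built-in tracepoint (module tracepoints are reached through the tp_module notifiers above instead):

static void print_tp(struct tracepoint *tp, void *priv)
{
	pr_info("tracepoint: %s\n", tp->name);
}

static void list_kernel_tracepoints(void)
{
	for_each_kernel_tracepoint(print_tp, NULL);
}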
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */