diff options
author | Jens Axboe <axboe@fb.com> | 2014-04-15 20:02:24 (GMT) |
---|---|---|
committer | Jens Axboe <axboe@fb.com> | 2014-04-15 20:02:24 (GMT) |
commit | f89e0dd9d1a72fdf6b8958bcadfa6abf84f3cae0 (patch) | |
tree | 6d4ca8c67dc22d1c81053392078588f9ab3804b5 /kernel | |
parent | 21f9fcd81593e201172160853b8647336fb81f4f (diff) | |
parent | c9eaa447e77efe77b7fa4c953bd62de8297fd6c5 (diff) | |
download | linux-f89e0dd9d1a72fdf6b8958bcadfa6abf84f3cae0.tar.xz |
Merge tag 'v3.15-rc1' into for-3.16/core
We don't like this, but things have diverged with the blk-mq fixes
in 3.15-rc1. So merge it in.
Diffstat (limited to 'kernel')
40 files changed, 726 insertions, 594 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 95a20f3..7c28936 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - struct net *net; + struct net *net; struct sk_buff *skb; }; @@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb) if (printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); else - audit_log_lost("printk limit exceeded\n"); + audit_log_lost("printk limit exceeded"); } audit_hold_skb(skb); @@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb) BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ if (audit_pid) { pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); + audit_log_lost("auditd disappeared"); audit_pid = 0; audit_sock = NULL; } @@ -607,7 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; - /* Only support the initial namespaces for now. */ + /* Only support initial user namespace for now. */ /* * We return ECONNREFUSED because it tricks userspace into thinking * that audit was not configured into the kernel. Lots of users @@ -618,8 +618,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ - if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) + if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { @@ -639,6 +638,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: + /* Only support auditd and auditctl in initial pid namespace + * for now. */ + if ((task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; @@ -659,6 +663,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); + pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; @@ -668,7 +673,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); @@ -1097,7 +1102,7 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - rcu_assign_pointer(aunet->nlsk, NULL); + RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); netlink_kernel_release(sock); } @@ -1829,11 +1834,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - sys_getppid(), - tsk->pid, + task_ppid_nr(tsk), + task_pid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/audit.h b/kernel/audit.h index 8df13221..7bb6573 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -106,6 +106,11 @@ struct audit_names { bool should_free; }; +struct audit_proctitle { + int len; /* length of the cmdline field. */ + char *value; /* the cmdline field */ +}; + /* The per-task audit context. */ struct audit_context { int dummy; /* must be the first element */ @@ -202,6 +207,7 @@ struct audit_context { } execve; }; int fds[2]; + struct audit_proctitle proctitle; #if AUDIT_DEBUG int put_count; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 92062fd..8e9bc9c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -19,6 +19,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -226,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry) #endif /* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; @@ -249,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) @@ -403,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int i; char *str; - entry = audit_to_entry_common((struct audit_rule *)data); + entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; @@ -431,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->val = 0; } + if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { + struct pid *pid; + rcu_read_lock(); + pid = find_vpid(f->val); + if (!pid) { + rcu_read_unlock(); + err = -ESRCH; + goto exit_free; + } + f->val = pid_nr(pid); + rcu_read_unlock(); + } + err = audit_field_valid(entry, f); if (err) goto exit_free; @@ -479,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + str); err = 0; } if (err) { @@ -709,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + df->lsm_str); ret = 0; } @@ -1240,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + pid_t pid; int result = 0; u32 sid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(task_pid_vnr(current), f->op, f->val); + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7aef2f4..f251a5e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -42,6 +42,8 @@ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> @@ -68,6 +70,7 @@ #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> +#include <linux/ctype.h> #include "audit.h" @@ -79,6 +82,9 @@ /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 +/* max length to print of cmdline/proctitle value during audit */ +#define MAX_PROCTITLE_AUDIT_LEN 128 + /* number of audit rules */ int audit_n_rules; @@ -451,15 +457,17 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; + pid_t pid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); + pid = task_pid_nr(tsk); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { if (!ctx->ppid) - ctx->ppid = sys_getppid(); + ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; @@ -805,7 +813,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -static inline struct audit_context *audit_get_context(struct task_struct *tsk, +/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ +static inline struct audit_context *audit_take_context(struct task_struct *tsk, int return_valid, long return_code) { @@ -842,6 +851,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, return context; } +static inline void audit_proctitle_free(struct audit_context *context) +{ + kfree(context->proctitle.value); + context->proctitle.value = NULL; + context->proctitle.len = 0; +} + static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -850,16 +866,15 @@ static inline void audit_free_names(struct audit_context *context) if (context->put_count + context->ino_count != context->name_count) { int i = 0; - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, + pr_err("%s:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d ino_count=%d" + " [NOT freeing]\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("names[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } dump_stack(); return; @@ -955,6 +970,7 @@ static inline void audit_free_context(struct audit_context *context) audit_free_aux(context); kfree(context->filterkey); kfree(context->sockaddr); + audit_proctitle_free(context); kfree(context); } @@ -1157,7 +1173,7 @@ static void audit_log_execve_info(struct audit_context *context, */ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf) { - audit_panic("out of memory for argv string\n"); + audit_panic("out of memory for argv string"); return; } @@ -1271,6 +1287,59 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static inline int audit_proctitle_rtrim(char *proctitle, int len) +{ + char *end = proctitle + len - 1; + while (end > proctitle && !isprint(*end)) + end--; + + /* catch the case where proctitle is only 1 non-print character */ + len = end - proctitle + 1; + len -= isprint(proctitle[len-1]) == 0; + return len; +} + +static void audit_log_proctitle(struct task_struct *tsk, + struct audit_context *context) +{ + int res; + char *buf; + char *msg = "(null)"; + int len = strlen(msg); + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); + if (!ab) + return; /* audit_panic or being filtered */ + + audit_log_format(ab, "proctitle="); + + /* Not cached */ + if (!context->proctitle.value) { + buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); + if (!buf) + goto out; + /* Historically called this from procfs naming */ + res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + if (res == 0) { + kfree(buf); + goto out; + } + res = audit_proctitle_rtrim(buf, res); + if (res == 0) { + kfree(buf); + goto out; + } + context->proctitle.value = buf; + context->proctitle.len = res; + } + msg = context->proctitle.value; + len = context->proctitle.len; +out: + audit_log_n_untrustedstring(ab, msg, len); + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1388,6 +1457,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } + audit_log_proctitle(tsk, context); + /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) @@ -1406,7 +1477,7 @@ void __audit_free(struct task_struct *tsk) { struct audit_context *context; - context = audit_get_context(tsk, 0, 0); + context = audit_take_context(tsk, 0, 0); if (!context) return; @@ -1500,7 +1571,7 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_get_context(tsk, success, return_code); + context = audit_take_context(tsk, success, return_code); if (!context) return; @@ -1550,7 +1621,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1609,8 +1680,7 @@ retry: goto retry; } /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1682,7 +1752,7 @@ void __audit_getname(struct filename *name) if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + pr_err("%s:%d(:%d): ignoring getname(%p)\n", __FILE__, __LINE__, context->serial, name); dump_stack(); #endif @@ -1721,15 +1791,15 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", + pr_err("%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("name[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } #endif final_putname(name); @@ -1738,9 +1808,8 @@ void audit_putname(struct filename *name) else { ++context->put_count; if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", + pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" + " name_count=%d put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, name->name, @@ -1981,12 +2050,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" - " res=%d", - current->pid, uid, - oldloginuid, loginuid, oldsessionid, sessionid, - !rc); + audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); } @@ -2208,7 +2275,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = t->pid; + context->target_pid = task_pid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2233,7 +2300,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; + audit_sig_pid = task_pid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2247,7 +2314,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { - ctx->target_pid = t->tgid; + ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); @@ -2268,7 +2335,7 @@ int __audit_signal_info(int sig, struct task_struct *t) } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - axp->target_pid[axp->pid_count] = t->tgid; + axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); @@ -2368,7 +2435,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, current->comm); if (mm) { down_read(&mm->mmap_sem); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fede3d3..9fcdaa7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1487,6 +1487,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + bool new_sb; /* * The first time anyone tries to mount a cgroup, enable the list @@ -1603,8 +1604,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL); - if (IS_ERR(dentry)) + dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; } @@ -2345,11 +2346,26 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; + int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; @@ -2357,7 +2373,13 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, NULL, false, key); - return PTR_ERR_OR_ZERO(kn); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + kernfs_remove(kn); + return ret; } /** @@ -3752,6 +3774,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_kn_set_ugid(kn); + if (err) + goto err_destroy; + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e6..a9e710e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> +#include <linux/lockdep.h> #include "smpboot.h" @@ -27,18 +28,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -57,17 +63,30 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } cpu_hotplug = { .active_writer = NULL, .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "cpu_hotplug.lock" }, +#endif }; +/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ +#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) +#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) return; + cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); @@ -87,6 +106,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -117,6 +137,7 @@ void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; + cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) @@ -131,6 +152,7 @@ void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } /* @@ -166,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -189,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -198,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a7..2956c8d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/rcupdate.h> #include <asm/cacheflush.h> @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/exit.c b/kernel/exit.c index 6480d1c..6ed6a1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -570,7 +570,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ @@ -784,9 +784,10 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -799,19 +800,15 @@ void do_exit(long code) cgroup_exit(tsk); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); @@ -844,6 +841,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); @@ -1038,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1109,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1146,22 +1140,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1338,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1356,33 +1352,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by @@ -1408,19 +1415,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, wo->notask_error = 0; } else { /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ diff --git a/kernel/fork.c b/kernel/fork.c index abc4589..54a8d26 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> +#include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -71,6 +73,7 @@ #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> +#include <linux/compiler.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -284,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -364,7 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -530,8 +533,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -540,8 +541,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } @@ -877,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; @@ -1070,15 +1081,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1228,7 +1230,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); @@ -1274,7 +1277,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; diff --git a/kernel/futex.c b/kernel/futex.c index 67dacaf..5f58927 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -70,7 +70,10 @@ #include "locking/rtmutex_common.h" /* - * Basic futex operation and ordering guarantees: + * READ this before attempting to hack on futexes! + * + * Basic futex operation and ordering guarantees + * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires @@ -119,7 +122,7 @@ * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * - * waiters++; + * waiters++; (a) * mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | @@ -135,14 +138,14 @@ * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); - * wake_waiters(futex); - * unlock(hash_bucket(futex)); + * else wake_waiters(futex); + * waiters--; (b) unlock(hash_bucket(futex)); * - * Where (A) orders the waiters increment and the futex value read -- this - * is guaranteed by the head counter in the hb spinlock; and where (B) - * orders the write to futex and the waiters read -- this is done by the - * barriers in get_futex_key_refs(), through either ihold or atomic_inc, - * depending on the futex type. + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read -- this is done by the barriers in + * get_futex_key_refs(), through either ihold or atomic_inc, depending on the + * futex type. * * This yields the following case (where X:=waiters, Y:=futex): * @@ -155,6 +158,17 @@ * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in queue_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. This is done for + * double_lock_hb() and double_unlock_hb(), respectively. */ #ifndef CONFIG_HAVE_FUTEX_CMPXCHG @@ -1452,6 +1466,7 @@ retry: hb2 = hash_futex(&key2); retry_private: + hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -1461,6 +1476,7 @@ retry_private: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ret = get_user(curval, uaddr1); if (ret) @@ -1510,6 +1526,7 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); @@ -1519,6 +1536,7 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); cond_resched(); @@ -1594,6 +1612,7 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); /* * drop_futex_key_refs() must be called outside the spinlocks. During diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad5..cb0cf37 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/sections.h> @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0d261c..c8380ad 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> +#include <linux/compiler.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e660964..2495a9b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,7 @@ #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> +#include <linux/compiler.h> #include <linux/rcupdate.h> /* rcu_expedited */ @@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 306a76b..b8bdcd4 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o diff --git a/kernel/module.c b/kernel/module.c index 8dc7f5e..1186940 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } @@ -1013,6 +1013,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; + if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) + buf[l++] = 'E'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -3218,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs, pr_notice_once("%s: module verification failed: signature " "and/or required key missing - tainting " "kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } #endif @@ -3813,12 +3815,12 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - printk(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } preempt_enable(); if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); } #ifdef CONFIG_MODVERSIONS diff --git a/kernel/panic.c b/kernel/panic.c index cca8a91..d02fa9f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -141,7 +141,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +165,7 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +176,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -210,6 +211,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, + { TAINT_UNSIGNED_MODULE, 'E', ' ' }, }; /** @@ -228,6 +230,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. + * 'E' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ @@ -274,8 +277,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -380,8 +382,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* diff --git a/kernel/power/power.h b/kernel/power/power.h index 1ca7531..15f37ea 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/compiler.h> struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 149e745..18fb7a2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 90b3d93..c3ad9ca 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,6 +26,7 @@ #include <linux/syscore_ops.h> #include <linux/ftrace.h> #include <trace/events/power.h> +#include <linux/compiler.h> #include "power.h" @@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed2..8c9a481 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; diff --git a/kernel/profile.c b/kernel/profile.c index 1b266db..cb980f0 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -591,18 +591,28 @@ out_cleanup: int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; + int err = 0; if (!prof_on) return 0; - if (create_hash_tables()) - return -ENOMEM; + + cpu_notifier_register_begin(); + + if (create_hash_tables()) { + err = -ENOMEM; + goto out; + } + entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - return 0; + goto out; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; + __hotcpu_notifier(profile_cpu_callback, 0); + +out: + cpu_notifier_register_done(); + return err; } subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/relay.c b/kernel/relay.c index 52d6a6f..5a56d3c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -1253,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); + nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a30..51dbac6 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b30a292..3ef6451 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include <linux/sched.h> #include <linux/static_key.h> #include <linux/workqueue.h> +#include <linux/compiler.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b..268a45e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <linux/init_task.h> #include <linux/binfmts.h> #include <linux/context_tracking.h> +#include <linux/compiler.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -2845,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* @@ -6498,7 +6453,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fd609bd..d8d046c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -71,7 +71,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) struct pt_regs *regs = task_pt_regs(task); sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(task, regs); + sd->arch = syscall_get_arch(); /* Unroll syscall_get_args to help gcc on arm. */ syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); @@ -348,7 +348,7 @@ static void seccomp_send_sigsys(int syscall, int reason) info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; - info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_arch = syscall_get_arch(); info.si_syscall = syscall; force_sig_info(SIGSYS, &info, current); } diff --git a/kernel/signal.c b/kernel/signal.c index 5d4b05a..6ea13c0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> +#include <linux/compiler.h> + #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/sys.c b/kernel/sys.c index adaeab6..fba0f29 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c14b54..74f5b58 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> #endif @@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b40279..f7df8ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include <linux/compiler.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fc4da2d..c634868 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9be67c5..737b0ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,6 +3611,8 @@ static const char readme_msg[] = #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif + "\t\t dump\n" + "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" @@ -4390,8 +4392,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -4486,7 +4486,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -5279,8 +5279,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -5356,7 +5354,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffc314b..2e29d7b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#include <linux/compiler.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -1279,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 83a4378..3ddfd8f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -223,24 +223,25 @@ int ftrace_event_reg(struct ftrace_event_call *call, { struct ftrace_event_file *file = data; + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; @@ -352,7 +353,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", call->name); + "%s\n", ftrace_event_name(call)); break; } set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); @@ -481,27 +482,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; int ret = -EINVAL; list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && - strcmp(match, call->name) != 0 && + strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; - if (event && strcmp(event, call->name) != 0) + if (event && strcmp(event, name) != 0) continue; ftrace_event_enable_disable(file, set); @@ -699,7 +702,7 @@ static int t_show(struct seq_file *m, void *v) if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", call->name); + seq_printf(m, "%s\n", ftrace_event_name(call)); return 0; } @@ -792,7 +795,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!call->name || !call->class || !call->class->reg) + if (!ftrace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -907,7 +910,7 @@ static int f_show(struct seq_file *m, void *v) switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_printf(m, "format:\n"); return 0; @@ -1527,6 +1530,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) struct trace_array *tr = file->tr; struct list_head *head; struct dentry *d_events; + const char *name; int ret; /* @@ -1540,10 +1544,11 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) } else d_events = parent; - file->dir = debugfs_create_dir(call->name, d_events); + name = ftrace_event_name(call); + file->dir = debugfs_create_dir(name, d_events); if (!file->dir) { pr_warning("Could not create debugfs '%s' directory\n", - call->name); + name); return -1; } @@ -1567,7 +1572,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" - " events/%s\n", call->name); + " events/%s\n", name); return -1; } } @@ -1631,15 +1636,17 @@ static void event_remove(struct ftrace_event_call *call) static int event_init(struct ftrace_event_call *call) { int ret = 0; + const char *name; - if (WARN_ON(!call->name)) + name = ftrace_event_name(call); + if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", - call->name); + name); } return ret; @@ -1885,7 +1892,7 @@ __trace_add_event_dirs(struct trace_array *tr) ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create directory for event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -1894,18 +1901,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; - if (strcmp(event, call->name) == 0 && + if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } @@ -1973,7 +1982,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%s:%s:%s", data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, data->file->event_call->class->system, - data->file->event_call->name); + ftrace_event_name(data->file->event_call)); if (data->count == -1) seq_printf(m, ":unlimited\n"); @@ -2193,7 +2202,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warning("Could not create directory for event %s\n", - file->event_call->name); + ftrace_event_name(file->event_call)); } } @@ -2217,7 +2226,7 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create early event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -2549,7 +2558,7 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", call->name); + pr_info("Testing event %s: ", ftrace_event_name(call)); /* * If an event is already enabled, someone is using diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8efbb69..925f537 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, seq_printf(m, "%s:%s:%s", enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, enable_data->file->event_call->class->system, - enable_data->file->event_call->name); + ftrace_event_name(enable_data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited"); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index ee0a509..d4ddde2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -173,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \ }; \ \ struct ftrace_event_call __used event_##call = { \ - .name = #call, \ - .event.type = etype, \ .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d021d21..903ae28 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -341,7 +341,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; list_for_each_entry(tk, &probe_list, list) - if (strcmp(tk->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; return NULL; @@ -516,7 +516,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + tk->tp.call.class->system); if (old_tk) { ret = unregister_trace_kprobe(old_tk); if (ret < 0) @@ -564,7 +565,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tk->tp.call.name, mod->name, ret); + ftrace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -818,7 +820,8 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, + ftrace_event_name(&tk->tp.call)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -876,7 +879,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + seq_printf(m, " %-44s %15lu %15lu\n", + ftrace_event_name(&tk->tp.call), tk->nhit, tk->rp.kp.nmissed); return 0; @@ -1011,7 +1015,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1047,7 +1051,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1286,7 +1290,8 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->data = tk; ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register kprobe event: %s\n", call->name); + pr_info("Failed to register kprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ca0e79e2..a436de1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -431,7 +431,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", event->name); + ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e447336..930e514 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -294,7 +294,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -324,7 +324,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + tu->tp.call.class->system); if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -599,7 +600,8 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, + ftrace_event_name(&tu->tp.call)); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->tp.nr_args; i++) @@ -649,7 +651,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + ftrace_event_name(&tu->tp.call), tu->nhit); return 0; } @@ -844,12 +847,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); @@ -1275,7 +1280,8 @@ static int register_uprobe_event(struct trace_uprobe *tu) ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register uprobe event: %s\n", call->name); + pr_info("Failed to register uprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 50f8329..ac5b23c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Mathieu Desnoyers + * Copyright (C) 2008-2014 Mathieu Desnoyers * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,39 +33,27 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; +#ifdef CONFIG_MODULES /* - * Tracepoints mutex protects the builtin and module tracepoints and the hash - * table, as well as the local module list. + * Tracepoint module list mutex protects the local module list. */ -static DEFINE_MUTEX(tracepoints_mutex); +static DEFINE_MUTEX(tracepoint_module_list_mutex); -#ifdef CONFIG_MODULES -/* Local list of struct module */ +/* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* - * Tracepoint hash table, containing the active tracepoints. - * Protected by tracepoints_mutex. + * tracepoints_mutex protects the builtin and module tracepoints. + * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ -#define TRACEPOINT_HASH_BITS 6 -#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. - * Tracepoint entries modifications are protected by the tracepoints_mutex. */ -struct tracepoint_entry { - struct hlist_node hlist; - struct tracepoint_func *funcs; - int refcount; /* Number of times armed. 0 if disarmed. */ - int enabled; /* Tracepoint enabled */ - char name[0]; -}; - struct tp_probes { struct rcu_head rcu; struct tracepoint_func probes[0]; @@ -92,34 +80,33 @@ static inline void release_probes(struct tracepoint_func *old) } } -static void debug_print_probes(struct tracepoint_entry *entry) +static void debug_print_probes(struct tracepoint_func *funcs) { int i; - if (!tracepoint_debug || !entry->funcs) + if (!tracepoint_debug || !funcs) return; - for (i = 0; entry->funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); + for (i = 0; funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static struct tracepoint_func *func_add(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0; struct tracepoint_func *old, *new; - if (WARN_ON(!probe)) + if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); - debug_print_probes(entry); - old = entry->funcs; + debug_print_probes(*funcs); + old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -128,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes].func = probe; - new[nr_probes].data = data; + new[nr_probes] = *tp_func; new[nr_probes + 1].func = NULL; - entry->refcount = nr_probes + 1; - entry->funcs = new; - debug_print_probes(entry); + *funcs = new; + debug_print_probes(*funcs); return old; } -static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static void *func_remove(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; - old = entry->funcs; + old = *funcs; if (!old) return ERR_PTR(-ENOENT); - debug_print_probes(entry); + debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ - if (probe) { + if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) nr_del++; } } @@ -165,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ - entry->funcs = NULL; - entry->refcount = 0; - debug_print_probes(entry); + *funcs = NULL; + debug_print_probes(*funcs); return old; } else { int j = 0; @@ -177,91 +160,35 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (old[i].func != probe || old[i].data != data) + if (old[i].func != tp_func->func + || old[i].data != tp_func->data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; - entry->refcount = nr_probes - nr_del; - entry->funcs = new; + *funcs = new; } - debug_print_probes(entry); + debug_print_probes(*funcs); return old; } /* - * Get tracepoint if the tracepoint is present in the tracepoint hash table. - * Must be called with tracepoints_mutex held. - * Returns NULL if not present. + * Add the probe function to a tracepoint. */ -static struct tracepoint_entry *get_tracepoint(const char *name) +static int tracepoint_add_func(struct tracepoint *tp, + struct tracepoint_func *func) { - struct hlist_head *head; - struct tracepoint_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} + struct tracepoint_func *old, *tp_funcs; -/* - * Add the tracepoint to the tracepoint hash table. Must be called with - * tracepoints_mutex held. - */ -static struct tracepoint_entry *add_tracepoint(const char *name) -{ - struct hlist_head *head; - struct tracepoint_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "tracepoint %s busy\n", name); - return ERR_PTR(-EEXIST); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - e->funcs = NULL; - e->refcount = 0; - e->enabled = 0; - hlist_add_head(&e->hlist, head); - return e; -} + if (tp->regfunc && !static_key_enabled(&tp->key)) + tp->regfunc(); -/* - * Remove the tracepoint from the tracepoint hash table. Must be called with - * mutex_lock held. - */ -static inline void remove_tracepoint(struct tracepoint_entry *e) -{ - hlist_del(&e->hlist); - kfree(e); -} - -/* - * Sets the probe callback corresponding to one tracepoint. - */ -static void set_tracepoint(struct tracepoint_entry **entry, - struct tracepoint *elem, int active) -{ - WARN_ON(strcmp((*entry)->name, elem->name) != 0); - - if (elem->regfunc && !static_key_enabled(&elem->key) && active) - elem->regfunc(); - else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) - elem->unregfunc(); + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); + old = func_add(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); + } + release_probes(old); /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new @@ -270,197 +197,163 @@ static void set_tracepoint(struct tracepoint_entry **entry, * include/linux/tracepoints.h. A matching smp_read_barrier_depends() * is used. */ - rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !static_key_enabled(&elem->key)) - static_key_slow_inc(&elem->key); - else if (!active && static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); + rcu_assign_pointer(tp->funcs, tp_funcs); + if (!static_key_enabled(&tp->key)) + static_key_slow_inc(&tp->key); + return 0; } /* - * Disable a tracepoint and its probe callback. + * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured * by preempt_disable around the call site. */ -static void disable_tracepoint(struct tracepoint *elem) +static int tracepoint_remove_func(struct tracepoint *tp, + struct tracepoint_func *func) { - if (elem->unregfunc && static_key_enabled(&elem->key)) - elem->unregfunc(); - - if (static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); - rcu_assign_pointer(elem->funcs, NULL); -} + struct tracepoint_func *old, *tp_funcs; -/** - * tracepoint_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of tracepoints. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) -{ - struct tracepoint * const *iter; - struct tracepoint_entry *mark_entry; - - if (!begin) - return; - - for (iter = begin; iter < end; iter++) { - mark_entry = get_tracepoint((*iter)->name); - if (mark_entry) { - set_tracepoint(&mark_entry, *iter, - !!mark_entry->refcount); - mark_entry->enabled = !!mark_entry->refcount; - } else { - disable_tracepoint(*iter); - } + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); + old = func_remove(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); } -} - -#ifdef CONFIG_MODULES -void module_update_tracepoints(void) -{ - struct tp_module *tp_mod; - - list_for_each_entry(tp_mod, &tracepoint_module_list, list) - tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, - tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); -} -#else /* CONFIG_MODULES */ -void module_update_tracepoints(void) -{ -} -#endif /* CONFIG_MODULES */ + release_probes(old); + if (!tp_funcs) { + /* Removed last function */ + if (tp->unregfunc && static_key_enabled(&tp->key)) + tp->unregfunc(); -/* - * Update probes, removing the faulty probes. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probes(void) -{ - /* Core kernel tracepoints */ - tracepoint_update_probe_range(__start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - /* tracepoints in modules. */ - module_update_tracepoints(); -} - -static struct tracepoint_func * -tracepoint_add_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) - return (struct tracepoint_func *)entry; + if (static_key_enabled(&tp->key)) + static_key_slow_dec(&tp->key); } - old = tracepoint_entry_add_probe(entry, probe, data); - if (IS_ERR(old) && !entry->refcount) - remove_tracepoint(entry); - return old; + rcu_assign_pointer(tp->funcs, tp_funcs); + return 0; } /** * tracepoint_probe_register - Connect a probe to a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe handler - * @data: probe private data - * - * Returns: - * - 0 if the probe was successfully registered, and tracepoint - * callsites are currently loaded for that probe, - * - -ENODEV if the probe was successfully registered, but no tracepoint - * callsite is currently loaded for that probe, - * - other negative error value on error. - * - * When tracepoint_probe_register() returns either 0 or -ENODEV, - * parameters @name, @probe, and @data may be used by the tracepoint - * infrastructure until the probe is unregistered. * - * The probe address must at least be aligned on the architecture pointer size. + * Returns 0 if ok, error value on error. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. */ -int tracepoint_probe_register(const char *name, void *probe, void *data) +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; - struct tracepoint_entry *entry; - int ret = 0; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ - entry = get_tracepoint(name); - /* Make sure the entry was enabled */ - if (!entry || !entry->enabled) - ret = -ENODEV; + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_add_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static struct tracepoint_func * -tracepoint_remove_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) - return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe, data); - if (IS_ERR(old)) - return old; - if (!entry->refcount) - remove_tracepoint(entry); - return old; -} - /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe function pointer - * @data: probe private data * - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. + * Returns 0 if ok, error value on error. */ -int tracepoint_probe_unregister(const char *name, void *probe, void *data) +int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); - #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { - return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | + (1 << TAINT_UNSIGNED_MODULE)); +} + +static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); + +/** + * register_tracepoint_notifier - register tracepoint coming/going notifier + * @nb: notifier block + * + * Notifiers registered with this function are called on module + * coming/going with the tracepoint_module_list_mutex held. + * The notifier block callback should expect a "struct tp_module" data + * pointer. + */ +int register_tracepoint_module_notifier(struct notifier_block *nb) +{ + struct tp_module *tp_mod; + int ret; + + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); +end: + mutex_unlock(&tracepoint_module_list_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); + +/** + * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * @nb: notifier block + * + * The notifier block callback should expect a "struct tp_module" data + * pointer. + */ +int unregister_tracepoint_module_notifier(struct notifier_block *nb) +{ + struct tp_module *tp_mod; + int ret; + + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); +end: + mutex_unlock(&tracepoint_module_list_mutex); + return ret; + +} +EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); + +/* + * Ensure the tracer unregistered the module's probes before the module + * teardown is performed. Prevents leaks of probe and data pointers. + */ +static void tp_module_going_check_quiescent(struct tracepoint * const *begin, + struct tracepoint * const *end) +{ + struct tracepoint * const *iter; + + if (!begin) + return; + for (iter = begin; iter < end; iter++) + WARN_ON_ONCE((*iter)->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -474,40 +367,45 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. + * Staging, out-of-tree, and unsigned GPL modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoints_mutex); + mutex_lock(&tracepoint_module_list_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) { ret = -ENOMEM; goto end; } - tp_mod->num_tracepoints = mod->num_tracepoints; - tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + tp_mod->mod = mod; list_add_tail(&tp_mod->list, &tracepoint_module_list); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_COMING, tp_mod); end: - mutex_unlock(&tracepoints_mutex); + mutex_unlock(&tracepoint_module_list_mutex); return ret; } -static int tracepoint_module_going(struct module *mod) +static void tracepoint_module_going(struct module *mod) { - struct tp_module *pos; + struct tp_module *tp_mod; if (!mod->num_tracepoints) - return 0; + return; - mutex_lock(&tracepoints_mutex); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - list_for_each_entry(pos, &tracepoint_module_list, list) { - if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { - list_del(&pos->list); - kfree(pos); + mutex_lock(&tracepoint_module_list_mutex); + list_for_each_entry(tp_mod, &tracepoint_module_list, list) { + if (tp_mod->mod == mod) { + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_GOING, tp_mod); + list_del(&tp_mod->list); + kfree(tp_mod); + /* + * Called the going notifier before checking for + * quiescence. + */ + tp_module_going_check_quiescent(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); break; } } @@ -517,12 +415,11 @@ static int tracepoint_module_going(struct module *mod) * flag on "going", in case a module taints the kernel only after being * loaded. */ - mutex_unlock(&tracepoints_mutex); - return 0; + mutex_unlock(&tracepoint_module_list_mutex); } -int tracepoint_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; int ret = 0; @@ -534,24 +431,58 @@ int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: - ret = tracepoint_module_going(mod); + tracepoint_module_going(mod); + break; + case MODULE_STATE_UNFORMED: break; } return ret; } -struct notifier_block tracepoint_module_nb = { +static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; -static int init_tracepoints(void) +static __init int init_tracepoints(void) { - return register_module_notifier(&tracepoint_module_nb); + int ret; + + ret = register_module_notifier(&tracepoint_module_nb); + if (ret) + pr_warning("Failed to register tracepoint module enter notifier\n"); + + return ret; } __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + struct tracepoint * const *iter; + + if (!begin) + return; + for (iter = begin; iter < end; iter++) + fct(*iter, priv); +} + +/** + * for_each_kernel_tracepoint - iteration on all kernel tracepoints + * @fct: callback + * @priv: private data + */ +void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + for_each_tracepoint_range(__start___tracepoints_ptrs, + __stop___tracepoints_ptrs, fct, priv); +} +EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); + #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ |