diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-09-24 08:31:34 (GMT) |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-09-24 08:31:34 (GMT) |
commit | e6aa0f07cb5e81a7cbeaf3be6e2101234c2f0d30 (patch) | |
tree | 77926550ac0c31b1423bcf193a4ed0ecb7fda2c1 /kernel | |
parent | d4738792fb86600b6cb7220459d9c47e819b3580 (diff) | |
parent | 72d31053f62c4bc464c2783974926969614a8649 (diff) | |
download | linux-fsl-qoriq-e6aa0f07cb5e81a7cbeaf3be6e2101234c2f0d30.tar.xz |
Merge commit 'v2.6.27-rc7' into x86/microcode
Diffstat (limited to 'kernel')
56 files changed, 1410 insertions, 859 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 382dd5a..94fabd5 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) diff --git a/kernel/audit.c b/kernel/audit.c index e092f1c..4414e93 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_PID) { int new_pid = status_get->pid; @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, loginuid, sessionid, sid); + if (err < 0) + return err; + } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sessionid, sid); @@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7f) + if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } return 0; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 98c50cc..b7d354e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_format(ab, - "op=updated rules specifying path="); + " op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule path="); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_format(ab, " op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4699950..59cedfb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { - unsigned n = ctx->major; + unsigned n; + if (unlikely(!ctx)) + return 0; + + n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; mode_t mode = which & S_IFMT; + + if (unlikely(!ctx)) + return 0; + if (index >= ctx->name_count) return 0; if (ctx->names[index].ino == -1) @@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk, if (!result) return 0; } - if (rule->filterkey) + if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; @@ -2375,7 +2383,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_context *ctx = tsk->audit_context; if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; diff --git a/kernel/capability.c b/kernel/capability.c index 0101e84..33e51e7 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) return ret; } -int __capable(struct task_struct *t, int cap) +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +int capable(int cap) { - if (security_capable(t, cap) == 0) { - t->flags |= PF_SUPERPRIV; + if (has_capability(current, cap)) { + current->flags |= PF_SUPERPRIV; return 1; } return 0; } - -int capable(int cap) -{ - return __capable(current, cap); -} EXPORT_SYMBOL(capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 657f8f8..13932abd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. Returns 0 on @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -1424,14 +1418,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; @@ -2371,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the diff --git a/kernel/cpu.c b/kernel/cpu.c index e202a68..f17e985 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -349,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + cpu_set(cpu, cpu_active_map); + /* Now call notifier in preparation. */ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); @@ -367,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_isset(cpu, cpu_possible_map)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif @@ -383,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu) err = _cpu_up(cpu, 0); - if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); - out: cpu_maps_update_done(); return err; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 91cf85b..827cd9a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -54,7 +56,6 @@ #include <asm/uaccess.h> #include <asm/atomic.h> #include <linux/mutex.h> -#include <linux/kfifo.h> #include <linux/workqueue.h> #include <linux/cgroup.h> @@ -237,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); -/* This is ugly, but preserves the userspace API for existing cpuset +/* + * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead */ + * silently switch it to mount "cgroup" instead + */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) @@ -474,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* - * Helper routine for rebuild_sched_domains(). + * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ - static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); @@ -486,34 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { - if (!dattr) - return; if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpus_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + /* - * rebuild_sched_domains() - * - * This routine will be called to rebuild the scheduler's dynamic - * sched domains: - * - if the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, - * - or if the 'cpus' allowed changes in any cpuset which has that - * flag enabled, - * - or if the 'sched_relax_domain_level' of any cpuset which has - * that flag enabled and with non-empty 'cpus' changes, - * - or if any cpuset with non-empty 'cpus' is removed, - * - or if a cpu gets offlined. - * - * This routine builds a partial partition of the systems CPUs - * (the set of non-overlappping cpumask_t's in the array 'part' - * below), and passes that partial partition to the kernel/sched.c - * partition_sched_domains() routine, which will rebuild the - * schedulers load balancing domains (sched domains) as specified - * by that partial partition. A 'partial partition' is a set of - * non-overlapping subsets whose union is a subset of that set. + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. @@ -523,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Call with cgroup_mutex held. May take callback_mutex during - * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the get_online_cpus()/put_online_cpus() pair. - * Must not be called holding callback_mutex, because we must not - * call get_online_cpus() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside get_online_cpus() calls. - * So the reverse nesting would risk an ABBA deadlock. + * Must be called with cgroup_lock held. * * The three key local variables below are: - * q - a kfifo queue of cpuset pointers, used to implement a + * q - a linked-list queue of cpuset pointers, used to implement a * top-down scan of all cpusets. This scan loads a pointer * to each cpuset marked is_sched_load_balance into the * array 'csa'. For our purposes, rebuilding the schedulers @@ -564,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ - -void rebuild_sched_domains(void) +static int generate_sched_domains(cpumask_t **domains, + struct sched_domain_attr **attributes) { - struct kfifo *q; /* queue of cpusets to be scanned */ + LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -577,49 +587,58 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - q = NULL; - csa = NULL; + ndoms = 0; doms = NULL; dattr = NULL; + csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) - goto rebuild; + goto done; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; - update_domain_attr(dattr, &top_cpuset); + update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; - goto rebuild; - } - q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); - if (IS_ERR(q)) + ndoms = 1; goto done; + } + csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; - cp = &top_cpuset; - __kfifo_put(q, (void *)&cp, sizeof(cp)); - while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + if (cpus_empty(cp->cpus_allowed)) continue; - if (is_sched_load_balance(cp)) + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. + */ + if (is_sched_load_balance(cp)) { csa[csn++] = cp; + continue; + } list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); - __kfifo_put(q, (void *)&child, sizeof(cp)); + list_add_tail(&child->stack_list, &q); } } @@ -650,63 +669,141 @@ restart: } } - /* Convert <csn, csa> to <ndoms, doms> */ + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto rebuild; + if (!doms) { + ndoms = 0; + goto done; + } + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; + cpumask_t *dp; int apn = a->pn; - if (apn >= 0) { - cpumask_t *dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; } + continue; + } - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - b->pn = -1; - if (dattr) - update_domain_attr(dattr - + nslot, b); - } + cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; } - nslot++; } + nslot++; } BUG_ON(nslot != ndoms); -rebuild: - /* Have scheduler rebuild sched domains */ +done: + kfree(csa); + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. + */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + get_online_cpus(); - partition_sched_domains(ndoms, doms, dattr); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + put_online_cpus(); +} -done: - if (q && !IS_ERR(q)) - kfifo_free(q); - kfree(csa); - /* Don't kfree(doms) -- partition_sched_domains() does that. */ - /* Don't kfree(dattr) -- partition_sched_domains() does that. */ +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. + * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + schedule_work(&rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); } /** @@ -746,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - struct ptr_heap heap; - int retval; - - /* - * cgroup_scan_tasks() will initialize heap->gt for us. - * heap_init() is still needed here for we should not change - * cs->cpus_allowed when heap_init() fails. - */ - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; scan.cg = cs->css.cgroup; scan.test_task = cpuset_test_cpumask; scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - retval = cgroup_scan_tasks(&scan); - - heap_free(&heap); - return retval; + scan.heap = heap; + cgroup_scan_tasks(&scan); } /** @@ -786,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) */ static int update_cpumask(struct cpuset *cs, const char *buf) { + struct ptr_heap heap; struct cpuset trialcs; int retval; int is_load_balanced; @@ -820,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -830,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - retval = update_tasks_cpumask(cs); - if (retval < 0) - return retval; + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); if (is_load_balanced) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1062,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains(); + async_rebuild_sched_domains(); } return 0; @@ -1103,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1464,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unreachable but makes gcc happy */ + return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) @@ -1476,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unrechable but makes gcc happy */ + return 0; } @@ -1664,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* - * Locking note on the strange update_flag() call below: - * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The get_online_cpus() - * call in rebuild_sched_domains() must not be made while holding - * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * get_online_cpus() calls. So the reverse nesting would risk an - * ABBA deadlock. + * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1691,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .create = cpuset_create, - .destroy = cpuset_destroy, + .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .populate = cpuset_populate, @@ -1783,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* - * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty @@ -1833,24 +1923,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void scan_for_empty_cpusets(const struct cpuset *root) { + LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ - struct list_head queue; struct cgroup *cont; nodemask_t oldmems; - INIT_LIST_HEAD(&queue); - list_add_tail((struct list_head *)&root->stack_list, &queue); while (!list_empty(&queue)) { - cp = container_of(queue.next, struct cpuset, stack_list); + cp = list_first_entry(&queue, struct cpuset, stack_list); list_del(queue.next); list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); list_add_tail(&child->stack_list, &queue); } - cont = cp->css.cgroup; /* Continue past cpusets with all cpus, mems online */ if (cpus_subset(cp->cpus_allowed, cpu_online_map) && @@ -1871,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, NULL); update_tasks_nodemask(cp, &oldmems); } } } /* - * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug event. - * - * Since there are two callers of this routine, one for CPU hotplug - * events and one for memory node hotplug events, we could have coded - * two separate routines here. We code it as a single common routine - * in order to minimize text size. - */ - -static void common_cpu_mem_hotplug_unplug(int rebuild_sd) -{ - cgroup_lock(); - - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - - /* - * Scheduler destroys domains on hotplug events. - * Rebuild them based on the current settings. - */ - if (rebuild_sd) - rebuild_sched_domains(); - - cgroup_unlock(); -} - -/* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent * (of no affect) on systems that are actively using CPU hotplug @@ -1914,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). */ - -static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, +static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + switch (phase) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - common_cpu_mem_hotplug_unplug(1); break; + default: return NOTIFY_DONE; } + cgroup_lock(); + top_cpuset.cpus_allowed = cpu_online_map; + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after you change - * node_states[N_HIGH_MEMORY]. - * See also the previous routine cpuset_handle_cpuhp(). + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). */ - void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(0); + cgroup_lock(); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + cgroup_unlock(); } #endif @@ -1962,7 +2032,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_handle_cpuhp, 0); + hotcpu_notifier(cpuset_track_online_cpus, 0); } /** diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 7517115..c1d4d5b 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev, { struct dma_coherent_mem *mem = dev->dma_mem; int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - pages >>= PAGE_SHIFT; + size += device_addr & ~PAGE_MASK; if (!mem) return ERR_PTR(-EINVAL); pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); if (err != 0) return ERR_PTR(err); return mem->virt_base + (pos << PAGE_SHIFT); @@ -93,7 +92,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev, EXPORT_SYMBOL(dma_mark_declared_memory_occupied); /** - * Try to allocate memory from the per-device coherent area. + * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area * * @dev: device from which we allocate memory * @size: size of requested memory area @@ -101,11 +100,11 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); * @ret: This pointer will be filled with the virtual address * to allocated area. * - * This function should be only called from per-arch %dma_alloc_coherent() + * This function should be only called from per-arch dma_alloc_coherent() * to support allocation from per-device coherent memory pools. * * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return %ret. + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. */ int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) @@ -127,7 +126,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } /** - * Try to free the memory allocated from per-device coherent memory pool. + * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool * @dev: device from which the memory was allocated * @order: the order of pages allocated * @vaddr: virtual address of allocated pages @@ -136,7 +135,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, * coherent memory pool and if so, releases that memory. * * Returns 1 if we correctly released the memory, or 0 if - * %dma_release_coherent() should proceed with releasing memory from + * dma_release_coherent() should proceed with releasing memory from * generic pools. */ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) diff --git a/kernel/exit.c b/kernel/exit.c index eb4d647..1639564 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) * the child reaper process (ie "init") in our pid * space. */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; + struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); - + reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@ -911,15 +935,15 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal > 0) + if (signal >= 0) signal = do_notify_parent(tsk, signal); - tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); @@ -927,7 +951,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ - if (signal < 0) + if (signal == DEATH_REAP) release_task(tsk); } @@ -959,39 +983,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 152abfd..0314074 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); if (ret) - pr_err("setting flow type for irq %u failed (%pF)\n", + pr_err("setting trigger mode %d for irq %u failed (%pF)\n", + (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); return ret; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c6d35d..a09dd29 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -8,6 +8,7 @@ #include <linux/irq.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/interrupt.h> #include "internals.h" @@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)data; + struct irq_desc *desc = irq_desc + (long)m->private; cpumask_t *mask = &desc->affinity; - int len; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) mask = &desc->pending_mask; #endif - len = cpumask_scnprintf(page, count, *mask); - - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, mask); + seq_putc(m, '\n'); + return 0; } #ifndef is_affinity_mask_valid @@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, #endif int no_irq_affinity; -static int irq_affinity_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)data, full_count = count, err; + unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; cpumask_t new_value; + int err; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? -EINVAL : count; irq_set_affinity(irq, new_value); - return full_count; + return count; } -static int default_affinity_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - int len = cpumask_scnprintf(page, count, irq_default_affinity); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } -static int default_affinity_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static const struct file_operations irq_affinity_proc_fops = { + .open = irq_affinity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_proc_write, +}; + +static int default_affinity_show(struct seq_file *m, void *v) +{ + seq_cpumask(m, &irq_default_affinity); + seq_putc(m, '\n'); + return 0; +} + +static ssize_t default_affinity_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { - unsigned int full_count = count, err; cpumask_t new_value; + int err; err = cpumask_parse_user(buffer, count, new_value); if (err) @@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer, irq_default_affinity = new_value; - return full_count; + return count; } + +static int default_affinity_open(struct inode *inode, struct file *file) +{ + return single_open(file, default_affinity_show, NULL); +} + +static const struct file_operations default_affinity_proc_fops = { + .open = default_affinity_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = default_affinity_write, +}; #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq) irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP - { - /* create /proc/irq/<irq>/smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); - - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - } + /* create /proc/irq/<irq>/smp_affinity */ + proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + &irq_affinity_proc_fops, (void *)(long)irq); #endif entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); @@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - struct proc_dir_entry *entry; - - /* create /proc/irq/default_smp_affinity */ - entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); - if (entry) { - entry->data = NULL; - entry->read_proc = default_affinity_read; - entry->write_proc = default_affinity_write; - } + proc_create("irq/default_smp_affinity", 0600, NULL, + &default_affinity_proc_fops); #endif } diff --git a/kernel/kexec.c b/kernel/kexec.c index c8a4370..59f3f0df3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -12,7 +12,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> -#include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p) * * The code for the transition from the current kernel to the * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range @@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; @@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; @@ -924,19 +924,14 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; -/* - * A home grown binary mutex. - * Nothing can wait so this mutex is safe to use - * in interrupt context :) - */ -static int kexec_lock; + +static DEFINE_MUTEX(kexec_mutex); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags) { struct kimage **dest_image, *image; - int locked; int result; /* We only trust the superuser with rebooting the system. */ @@ -972,8 +967,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, * * KISS: always take the mutex. */ - locked = xchg(&kexec_lock, 1); - if (locked) + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; dest_image = &kexec_image; @@ -1015,8 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: - locked = xchg(&kexec_lock, 0); /* Release the mutex */ - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); kimage_free(image); return result; @@ -1063,10 +1056,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { - int locked; - - - /* Take the kexec_lock here to prevent sys_kexec_load + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -1074,8 +1064,7 @@ void crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - locked = xchg(&kexec_lock, 1); - if (!locked) { + if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); @@ -1083,8 +1072,7 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - locked = xchg(&kexec_lock, 0); - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); } } @@ -1426,25 +1414,23 @@ static int __init crash_save_vmcoreinfo_init(void) module_init(crash_save_vmcoreinfo_init) -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; - if (xchg(&kexec_lock, 1)) + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } - if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { mutex_lock(&pm_mutex); pm_prepare_console(); error = freeze_processes(); @@ -1459,6 +1445,7 @@ int kernel_kexec(void) error = disable_nonboot_cpus(); if (error) goto Resume_devices; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* @@ -1470,26 +1457,22 @@ int kernel_kexec(void) error = device_power_down(PMSG_FREEZE); if (error) goto Enable_irqs; - save_processor_state(); + } else #endif - } else { - blocking_notifier_call_chain(&reboot_notifier_list, - SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); - sysdev_shutdown(); + { + kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } machine_kexec(kexec_image); - if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP - restore_processor_state(); + if (kexec_image->preserve_context) { device_power_up(PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); enable_nonboot_cpus(); Resume_devices: device_resume(PMSG_RESTORE); @@ -1499,11 +1482,10 @@ int kernel_kexec(void) Restore_console: pm_restore_console(); mutex_unlock(&pm_mutex); -#endif } +#endif Unlock: - xchg(&kexec_lock, 0); - + mutex_unlock(&kexec_mutex); return error; } diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 3ec23c3..eaa21fc 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -56,12 +56,14 @@ static int kgdb_break_asap; +#define KGDB_MAX_THREAD_QUERY 17 struct kgdb_state { int ex_vector; int signo; int err_code; int cpu; int pass_exception; + unsigned long thr_query; unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; @@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - - return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); -} - int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) { int err; @@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) (char *)bundle, BREAK_INSTR_SIZE); } +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); @@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; + int negate = 0; *long_val = 0; + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } while (**ptr) { hex_val = hex(**ptr); if (hex_val < 0) @@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } + if (negate) + *long_val = -*long_val; + return num; } @@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) static struct task_struct *getthread(struct pt_regs *regs, int tid) { /* - * Non-positive TIDs are remapped idle tasks: + * Non-positive TIDs are remapped to the cpu shadow information */ - if (tid <= 0) - return idle_task(-tid); + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < 0) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -725,14 +753,15 @@ setundefined: } /* - * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 */ static inline int shadow_pid(int realpid) { if (realpid) return realpid; - return -1-raw_smp_processor_id(); + return -raw_smp_processor_id() - 2; } static char gdbmsgbuf[BUFMAX + 1]; @@ -826,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; } else { local_debuggerinfo = NULL; - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { /* * Try to find the task on some other * or possibly this node if we do not @@ -960,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* Handle the 'q' query packets */ static void gdb_cmd_query(struct kgdb_state *ks) { - struct task_struct *thread; + struct task_struct *g; + struct task_struct *p; unsigned char thref[8]; char *ptr; int i; + int cpu; + int finished = 0; switch (remcom_in_buffer[1]) { case 's': @@ -973,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) break; } - if (remcom_in_buffer[1] == 'f') - ks->threadid = 1; - + i = 0; remcom_out_buffer[0] = 'm'; ptr = remcom_out_buffer + 1; - - for (i = 0; i < 17; ks->threadid++) { - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) { - int_to_threadref(thref, ks->threadid); + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); pack_threadid(ptr, thref); ptr += BUF_THREAD_ID_SIZE; *(ptr++) = ','; i++; } } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + *(--ptr) = '\0'; break; @@ -1011,15 +1055,15 @@ static void gdb_cmd_query(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - if (ks->threadid > 0) { + if ((int)ks->threadid > 0) { kgdb_mem2hex(getthread(ks->linux_regs, ks->threadid)->comm, remcom_out_buffer, 16); } else { static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - sprintf(tmpstr, "Shadow task %d for pid 0", - (int)(-ks->threadid-1)); + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index d38a643..dbda475 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static inline struct lock_class *hlock_class(struct held_lock *hlock) +{ + if (!hlock->class_idx) { + DEBUG_LOCKS_WARN_ON(1); + return NULL; + } + return lock_classes + hlock->class_idx - 1; +} + #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); @@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock) holdtime = sched_clock() - hlock->holdtime_stamp; - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) lock_time_inc(&stats->read_holdtime, holdtime); else @@ -372,6 +381,19 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; unsigned int max_recursion_depth; +static unsigned int lockdep_dependency_gen_id; + +static bool lockdep_dependency_visit(struct lock_class *source, + unsigned int depth) +{ + if (!depth) + lockdep_dependency_gen_id++; + if (source->dep_gen_id == lockdep_dependency_gen_id) + return true; + source->dep_gen_id = lockdep_dependency_gen_id; + return false; +} + #ifdef CONFIG_DEBUG_LOCKDEP /* * We cannot printk in early bootup code. Not even early_printk() @@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) static void print_lock(struct held_lock *hlock) { - print_lock_name(hlock->class); + print_lock_name(hlock_class(hlock)); printk(", at: "); print_ip_sym(hlock->acquire_ip); } @@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(class, depth)) + return; + if (DEBUG_LOCKS_WARN_ON(depth >= 20)) return; @@ -850,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - entry->class = this; - entry->distance = distance; if (!save_trace(&entry->trace)) return 0; + entry->class = this; + entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation @@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; - this.class = check_source->class; + this.class = hlock_class(check_source); if (!save_trace(&this.trace)) return 0; @@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void) return 0; } +unsigned long __lockdep_count_forward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_after, entry) + ret += __lockdep_count_forward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_forward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +unsigned long __lockdep_count_backward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_before, entry) + ret += __lockdep_count_backward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_backward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_backward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + /* * Prove that the dependency graph starting at <entry> can not * lead to <target>. Print an error and return 0 if it does. @@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(source, depth)) + return 1; + debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; @@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == check_target->class) + if (entry->class == hlock_class(check_target)) return print_circular_bug_header(entry, depth+1); debug_atomic_inc(&nr_cyclic_checks); if (!check_noncircular(entry->class, depth+1)) @@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) @@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (!__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); @@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) return 2; } + if (!source && debug_locks_off_graph_unlock()) { + WARN_ON(1); + return 0; + } + /* * Check this lock's dependency list: */ @@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nand this task is already holding:\n"); print_lock(prev); printk("which would create a new lock dependency:\n"); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" ->"); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", @@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev, find_usage_bit = bit_backwards; /* fills in <backwards_match> */ - ret = find_usage_backwards(prev->class, 0); + ret = find_usage_backwards(hlock_class(prev), 0); if (!ret || ret == 1) return ret; find_usage_bit = bit_forwards; - ret = find_usage_forwards(next->class, 0); + ret = find_usage_forwards(hlock_class(next), 0); if (!ret || ret == 1) return ret; /* ret == 2 */ @@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, struct lockdep_map *next_instance, int read) { struct held_lock *prev; + struct held_lock *nest = NULL; int i; for (i = 0; i < curr->lockdep_depth; i++) { prev = curr->held_locks + i; - if (prev->class != next->class) + + if (prev->instance == next->nest_lock) + nest = prev; + + if (hlock_class(prev) != hlock_class(next)) continue; + /* * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((read == 2) && prev->read) return 2; + + /* + * We're holding the nest_lock, which serializes this lock's + * nesting behaviour. + */ + if (nest) + return 2; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; - if (!(check_noncircular(next->class, 0))) + if (!(check_noncircular(hlock_class(next), 0))) return print_circular_bug_tail(); if (!check_prev_add_irq(curr, prev, next)) @@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * chains - the second one will be new, but L1 already has * L2 added to its dependency list, due to the first chain.) */ - list_for_each_entry(entry, &prev->class->locks_after, entry) { - if (entry->class == next->class) { + list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { + if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; return 2; @@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(prev->class, next->class, - &prev->class->locks_after, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + &hlock_class(prev)->locks_after, + next->acquire_ip, distance); if (!ret) return 0; - ret = add_lock_to_list(next->class, prev->class, - &next->class->locks_before, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + &hlock_class(next)->locks_before, + next->acquire_ip, distance); if (!ret) return 0; /* * Debugging printouts: */ - if (verbose(prev->class) || verbose(next->class)) { + if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { graph_unlock(); printk("\n new dependency: "); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" => "); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); dump_stack(); return graph_lock(); @@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock->class; + struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; @@ -1554,7 +1670,7 @@ cache_hit: if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = cn; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class - lock_classes; + int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); - printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)hlock->prev_chain_key); - WARN_ON(1); return; } - id = hlock->class - lock_classes; + id = hlock->class_idx - 1; if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); - printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)curr->curr_chain_key); - WARN_ON(1); } #endif } @@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); + print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); @@ -1714,7 +1828,7 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) return print_usage_bug(curr, this, bad_bit, new_bit); return 1; } @@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, lockdep_print_held_locks(curr); printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(this->class, 0); + print_lock_dependencies(hlock_class(this), 0); printk("\nthe second lock's dependencies:\n"); print_lock_dependencies(other, 0); @@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in <forwards_match> */ - ret = find_usage_forwards(this->class, 0); + ret = find_usage_forwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in <backwards_match> */ - ret = find_usage_backwards(this->class, 0); + ret = find_usage_backwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_HARDIRQS_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ: @@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_HARDIRQ_READ: @@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_HARDIRQS, "hard")) return 0; - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ_READ: @@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_SOFTIRQS, "soft")) return 0; - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS: @@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS: @@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS_READ: @@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ, "hard")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS_READ: @@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ, "soft")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; default: @@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * If already set then do not dirty the cacheline, * nor do any checks: */ - if (likely(this->class->usage_mask & new_mask)) + if (likely(hlock_class(this)->usage_mask & new_mask)) return 1; if (!graph_lock()) @@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didnt race: */ - if (unlikely(this->class->usage_mask & new_mask)) { + if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); return 1; } - this->class->usage_mask |= new_mask; + hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(this->class->usage_traces + new_bit)) + if (!save_trace(hlock_class(this)->usage_traces + new_bit)) return 0; switch (new_bit) { @@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; hlock = curr->held_locks + depth; - - hlock->class = class; + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + hlock->class_idx = class - lock_classes + 1; hlock->acquire_ip = ip; hlock->instance = lock; + hlock->nest_lock = nest_lock; hlock->trylock = trylock; hlock->read = read; hlock->check = check; - hlock->hardirqs_off = hardirqs_off; + hlock->hardirqs_off = !!hardirqs_off; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); @@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class_idx = class - lock_classes + 1; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. This is a @@ -2624,9 +2789,9 @@ found_it: for (i++; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, - hlock->class->subclass, hlock->trylock, + hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip)) return 0; } @@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr, #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; - hlock->class = NULL; + hlock->class_idx = 0; hlock->acquire_ip = 0; hlock->irq_context = 0; #endif @@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags) #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; @@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), ip); + irqs_disabled_flags(flags), nest_lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; @@ -2845,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock->class, ip); + point = lock_contention_point(hlock_class(hlock), ip); - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); @@ -2893,7 +3073,7 @@ found_it: hlock->holdtime_stamp = now; } - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) lock_time_inc(&stats->read_waittime, waittime); @@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class) list_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); + class->key = NULL; } static inline int within(const void *addr, void *start, unsigned long size) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c3600a0..56b1969 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -17,9 +17,6 @@ */ #define MAX_LOCKDEP_ENTRIES 8192UL -#define MAX_LOCKDEP_KEYS_BITS 11 -#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) - #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) @@ -53,6 +50,22 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +#ifdef CONFIG_PROVE_LOCKING +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ + return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ + return 0; +} +#endif + #ifdef CONFIG_DEBUG_LOCKDEP /* * Various lockdep statistics: diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9b0e940..20dbcbf 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v) { } -static unsigned long count_forward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += count_forward_deps(entry->class); - - return ret; -} - -static unsigned long count_backward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += count_backward_deps(entry->class); - - return ret; -} - static void print_name(struct seq_file *m, struct lock_class *class) { char str[128]; @@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - unsigned long nr_forward_deps, nr_backward_deps; struct lock_class *class = v; struct lock_list *entry; char c1, c2, c3, c4; @@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_LOCKDEP seq_printf(m, " OPS:%8ld", class->ops); #endif - nr_forward_deps = count_forward_deps(class); - seq_printf(m, " FD:%5ld", nr_forward_deps); - - nr_backward_deps = count_backward_deps(class); - seq_printf(m, " BD:%5ld", nr_backward_deps); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif get_usage_chars(class, &c1, &c2, &c3, &c4); seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); @@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v) for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); + if (!class->key) + continue; + seq_printf(m, "[%p] ", class->key); print_name(m, class); seq_puts(m, "\n"); @@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) nr_hardirq_read_unsafe++; - sum_forward_deps += count_forward_deps(class); +#ifdef CONFIG_PROVE_LOCKING + sum_forward_deps += lockdep_count_forward_deps(class); +#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); @@ -497,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) { unsigned long rem; + nr += 5; /* for display rounding */ rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) diff --git a/kernel/marker.c b/kernel/marker.c index 971da53..7d1faec 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, call_private); multi[i].func(multi[i].probe_private, call_private, @@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); diff --git a/kernel/module.c b/kernel/module.c index 61d2121..9db1191 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size) /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, +static noinline struct module *load_module(void __user *umod, unsigned long len, const char __user *uargs) { @@ -2288,7 +2288,7 @@ sys_init_module(void __user *umod, /* Start the module */ if (mod->init != NULL) - ret = mod->init(); + ret = do_one_initcall(mod->init); if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. */ diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac..12c779d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -34,6 +34,7 @@ /*** * mutex_init - initialize the mutex * @lock: the mutex to be initialized + * @key: the lock_class_key for the class; used by mutex lock debugging * * Initialize the mutex to unlocked state. * diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 21575fc..1d3ef29 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,6 @@ */ #include <linux/module.h> -#include <linux/version.h> #include <linux/nsproxy.h> #include <linux/init_task.h> #include <linux/mnt_namespace.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ea567b7..fab8ea8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; acct_exit_ns(pid_ns); return; } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb7570..dfdec52 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -24,7 +24,7 @@ * requirement that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * - * mark gross mgross@linux.intel.com + * Mark Gross <mgross@linux.intel.com> */ #include <linux/pm_qos_params.h> @@ -43,7 +43,7 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to target_value or requirements or notifiers lists + * locking rule: all changes to requirements or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ @@ -66,7 +66,7 @@ struct pm_qos_object { struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - s32 target_value; + atomic_t target_value; s32 (*comparitor)(s32, s32); }; @@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = { .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = { .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = 0, + .target_value = ATOMIC_INIT(0), .comparitor = max_compare }; @@ -150,11 +150,11 @@ static void update_target(int target) extreme_value = pm_qos_array[target]->comparitor( extreme_value, node->value); } - if (pm_qos_array[target]->target_value != extreme_value) { + if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { call_notifier = 1; - pm_qos_array[target]->target_value = extreme_value; + atomic_set(&pm_qos_array[target]->target_value, extreme_value); pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - pm_qos_array[target]->target_value); + atomic_read(&pm_qos_array[target]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_requirement(int pm_qos_class) { - int ret_val; - unsigned long flags; - - spin_lock_irqsave(&pm_qos_lock, flags); - ret_val = pm_qos_array[pm_qos_class]->target_value; - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return ret_val; + return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } EXPORT_SYMBOL_GPL(pm_qos_requirement); @@ -211,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos - * performance charactoistics. It recomputes the agregate QoS expectations for - * the pm_qos_class of parrameters. + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters. */ int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) { @@ -250,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); * @name: identifies the request * @value: defines the qos request * - * Updates an existing qos requierement for the pm_qos_class of parameters along + * Updates an existing qos requirement for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the lest then no change is made. + * If the named request isn't in the list then no change is made. */ int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) { @@ -287,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); * @pm_qos_class: identifies which list of qos request to us * @name: identifies the request * - * Will remove named qos request from pm_qos_class list of parrameters and + * Will remove named qos request from pm_qos_class list of parameters and * recompute the current target value for the pm_qos_class. */ void pm_qos_remove_requirement(int pm_qos_class, char *name) @@ -319,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * @notifier: notifier block managed by caller. * * will register the notifier into a notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { @@ -338,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); * @notifier: notifier block to be removed. * * will remove the notifier from the notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9a21681..e36d579 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info) else schedule_next_timer(timr); - info->si_overrun = timr->it_overrun_last; + info->si_overrun += timr->it_overrun_last; } if (timr) unlock_timer(timr, flags); } -int posix_timer_event(struct k_itimer *timr,int si_private) +int posix_timer_event(struct k_itimer *timr, int si_private) { - memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). + * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ timr->sigq->info.si_sys_private = si_private; - /* Send signal to the process that owns this timer.*/ timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_errno = 0; timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; @@ -435,6 +443,7 @@ static struct k_itimer * alloc_posix_timer(void) kmem_cache_free(posix_timers_cache, tmr); tmr = NULL; } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f011e08..bbd85c6 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -21,6 +21,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/ftrace.h> #include "power.h" @@ -255,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error; + int error, ftrace_save; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -366,10 +369,11 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error; + int error, ftrace_save; pm_prepare_console(); suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: + __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error; + int error, ftrace_save; if (!hibernation_ops) return -ENOSYS; @@ -411,6 +416,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -445,6 +451,7 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 0b7476f..540b16b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,7 @@ #include <linux/freezer.h> #include <linux/vmstat.h> #include <linux/syscalls.h> +#include <linux/ftrace.h> #include "power.h" @@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error; + int error, ftrace_save; if (!suspend_ops) return -ENOSYS; @@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a..80ccac8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -14,7 +14,6 @@ #include <linux/module.h> #include <linux/file.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559..b51b156 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) #if defined CONFIG_PRINTK -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * - * This enforces a rate limit: not more than one kernel message - * every printk_ratelimit_jiffies to make a denial-of-service - * attack impossible. + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + int printk_ratelimit(void) { return __ratelimit(&printk_ratelimit_state); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 082b3fc..356699a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task, mode); + return security_ptrace_may_access(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) @@ -499,8 +499,7 @@ repeat: goto repeat; } - ret = security_ptrace(current->parent, current, - PTRACE_MODE_ATTACH); + ret = security_ptrace_traceme(current->parent); /* * Set the ptrace bit in the process ptrace flags. diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f14f372..467d594 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ +void synchronize_rcu(void); /* Makes kernel-doc tools happy */ synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/relay.c b/kernel/relay.c index 04006ef..8d13a78 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, size_t n_subbufs = buf->chan->n_subbufs; size_t read_subbuf; + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { relay_subbufs_consumed(buf->chan, buf->cpu, 1); buf->bytes_consumed = 0; @@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) relay_file_read_consume(buf, read_pos, 0); + consumed = buf->subbufs_consumed; + if (unlikely(buf->offset > subbuf_size)) { if (produced == consumed) return 0; @@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) if (consumed > produced) produced += n_subbufs * subbuf_size; - if (consumed == produced) + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; return 0; + } return 1; } diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7..03d796c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) */ -int insert_resource(struct resource *parent, struct resource *new) +static struct resource * __insert_resource(struct resource *parent, struct resource *new) { - int result; struct resource *first, *next; - write_lock(&resource_lock); - for (;; parent = first) { - result = 0; first = __request_resource(parent, new); if (!first) - goto out; + return first; - result = -EBUSY; if (first == parent) - goto out; + return first; if ((first->start > new->start) || (first->end < new->end)) break; @@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new) for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) - goto out; + return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } - result = 0; - new->parent = parent; new->sibling = next->sibling; new->child = first; @@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new) next = next->sibling; next->sibling = new; } + return NULL; +} - out: +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @root: root resource descriptor + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } write_unlock(&resource_lock); - return result; } /** @@ -490,7 +528,7 @@ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: - return res->end - res->start + 1; + return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: diff --git a/kernel/sched.c b/kernel/sched.c index 0236958..9889080 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -600,7 +600,6 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif - struct lock_class_key rq_lock_key; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -809,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; /* * ratelimit for updating the group shares. - * default: 0.5ms + * default: 0.25ms */ -const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; +unsigned int sysctl_sched_shares_ratelimit = 250000; /* * period over which we measure -rt task cpu usage in us. @@ -834,7 +833,7 @@ static inline u64 global_rt_period(void) static inline u64 global_rt_runtime(void) { - if (sysctl_sched_rt_period < 0) + if (sysctl_sched_rt_runtime < 0) return RUNTIME_INF; return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; @@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) } else { if (rq1 < rq2) { spin_lock(&rq1->lock); - spin_lock(&rq2->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&rq2->lock); - spin_lock(&rq1->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock(&busiest->lock); + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -3637,7 +3643,7 @@ redo: ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); - spin_unlock(&busiest->lock); + double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), *cpus); @@ -3752,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) else schedstat_inc(sd, alb_failed); } - spin_unlock(&target_rq->lock); + double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ @@ -4173,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) } /* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -4663,6 +4728,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -5004,19 +5115,21 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (user - && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -5732,6 +5845,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP @@ -7581,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) - ndoms_new = 0; + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7611,7 +7729,6 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@ -7648,8 +7765,13 @@ match2: int arch_reinit_sched_domains(void) { get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + rebuild_sched_domains(); put_online_cpus(); + return 0; } @@ -7671,34 +7793,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif @@ -7733,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: @@ -7998,7 +8120,6 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); @@ -8455,8 +8576,8 @@ struct task_group *sched_create_group(struct task_group *parent) WARN_ON(!parent); /* root should already exist */ tg->parent = parent; - list_add_rcu(&tg->siblings, &parent->children); INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -8788,6 +8909,9 @@ static int sched_rt_global_constraints(void) u64 rt_runtime, rt_period; int ret = 0; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); rt_runtime = tg->rt_bandwidth.rt_runtime; @@ -8804,6 +8928,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 22ed55d..e8ab096 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -12,19 +12,17 @@ * * Create a semi stable clock from a mixture of other events, including: * - gtod - * - jiffies * - sched_clock() * - explicit idle events * * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. This window - * is set up using jiffies. + * making it monotonic and keeping it within an expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 1 jiffies difference). + * consistent between cpus (never more than 2 jiffies difference). */ #include <linux/sched.h> #include <linux/percpu.h> @@ -32,13 +30,19 @@ #include <linux/ktime.h> #include <linux/module.h> +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static __read_mostly int sched_clock_running; -#define MULTI_SHIFT 15 -/* Max is double, Min is 1/2 */ -#define MAX_MULTI (2LL << MULTI_SHIFT) -#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK struct sched_clock_data { /* @@ -48,15 +52,9 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long tick_jiffies; - u64 prev_raw; u64 tick_raw; u64 tick_gtod; u64 clock; - s64 multi; -#ifdef CONFIG_NO_HZ - int check_max; -#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -71,121 +69,69 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -static __read_mostly int sched_clock_running; - void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); - unsigned long now_jiffies = jiffies; int cpu; for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->tick_jiffies = now_jiffies; - scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; - scd->multi = 1 << MULTI_SHIFT; -#ifdef CONFIG_NO_HZ - scd->check_max = 1; -#endif } sched_clock_running = 1; } -#ifdef CONFIG_NO_HZ /* - * The dynamic ticks makes the delta jiffies inaccurate. This - * prevents us from checking the maximum time update. - * Disable the maximum check during stopped ticks. + * min,max except they take wrapping into account */ -void sched_clock_tick_stop(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 0; -} -void sched_clock_tick_start(int cpu) +static inline u64 wrap_min(u64 x, u64 y) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 1; + return (s64)(x - y) < 0 ? x : y; } -static int check_max(struct sched_clock_data *scd) +static inline u64 wrap_max(u64 x, u64 y) { - return scd->check_max; + return (s64)(x - y) > 0 ? x : y; } -#else -static int check_max(struct sched_clock_data *scd) -{ - return 1; -} -#endif /* CONFIG_NO_HZ */ /* * update the percpu scd from the raw @now value * * - filter out backward motion - * - use jiffies to generate a min,max window to clip the raw values + * - use the GTOD tick value to create a window to filter crazy TSC values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) +static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { - unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->tick_jiffies; - u64 clock = scd->clock; - u64 min_clock, max_clock; - s64 delta = now - scd->prev_raw; + s64 delta = now - scd->tick_raw; + u64 clock, min_clock, max_clock; WARN_ON_ONCE(!irqs_disabled()); - /* - * At schedule tick the clock can be just under the gtod. We don't - * want to push it too prematurely. - */ - min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); - if (min_clock > TICK_NSEC) - min_clock -= TICK_NSEC / 2; - - if (unlikely(delta < 0)) { - clock++; - goto out; - } + if (unlikely(delta < 0)) + delta = 0; /* - * The clock must stay within a jiffie of the gtod. - * But since we may be at the start of a jiffy or the end of one - * we add another jiffy buffer. + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ - max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - delta *= scd->multi; - delta >>= MULTI_SHIFT; + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, scd->clock); + max_clock = scd->tick_gtod + TICK_NSEC; - if (unlikely(clock + delta > max_clock) && check_max(scd)) { - if (clock < max_clock) - clock = max_clock; - else - clock++; - } else { - clock += delta; - } + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); - out: - if (unlikely(clock < min_clock)) - clock = min_clock; + scd->clock = clock; - if (time) - *time = clock; - else { - scd->prev_raw = now; - scd->clock = clock; - } + return scd->clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - u64 now, clock; + u64 now, clock, this_clock, remote_clock; if (unlikely(!sched_clock_running)) return 0ull; @@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu) now = sched_clock(); if (cpu != raw_smp_processor_id()) { - /* - * in order to update a remote cpu's clock based on our - * unstable raw time rebase it against: - * tick_raw (offset between raw counters) - * tick_gotd (tick offset between cpus) - */ struct sched_clock_data *my_scd = this_scd(); lock_double_clock(scd, my_scd); - now -= my_scd->tick_raw; - now += scd->tick_raw; + this_clock = __update_sched_clock(my_scd, now); + remote_clock = scd->clock; - now += my_scd->tick_gtod; - now -= scd->tick_gtod; + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + clock = this_clock; + scd->clock = clock; + } else { + /* + * Should be rare, but possible: + */ + clock = remote_clock; + my_scd->clock = remote_clock; + } __raw_spin_unlock(&my_scd->lock); - - __update_sched_clock(scd, now, &clock); - - __raw_spin_unlock(&scd->lock); - } else { __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - clock = scd->clock; - __raw_spin_unlock(&scd->lock); + clock = __update_sched_clock(scd, now); } + __raw_spin_unlock(&scd->lock); + return clock; } void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); - unsigned long now_jiffies = jiffies; - s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -260,29 +207,9 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - /* - * update tick_gtod after __update_sched_clock() because that will - * already observe 1 new jiffy; adding a new tick_gtod to that would - * increase the clock 2 jiffies. - */ - delta_gtod = now_gtod - scd->tick_gtod; - delta_raw = now - scd->tick_raw; - - if ((long)delta_raw > 0) { - mult = delta_gtod << MULTI_SHIFT; - do_div(mult, delta_raw); - scd->multi = mult; - if (scd->multi > MAX_MULTI) - scd->multi = MAX_MULTI; - else if (scd->multi < MIN_MULTI) - scd->multi = MIN_MULTI; - } else - scd->multi = 1 << MULTI_SHIFT; - scd->tick_raw = now; scd->tick_gtod = now_gtod; - scd->tick_jiffies = now_jiffies; + __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); } @@ -300,37 +227,28 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { - struct sched_clock_data *scd = this_scd(); - u64 now = sched_clock(); - - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - __raw_spin_lock(&scd->lock); - scd->prev_raw = now; - scd->clock += delta_ns; - scd->multi = 1 << MULTI_SHIFT; - __raw_spin_unlock(&scd->lock); - + sched_clock_tick(); touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#endif +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) +void sched_clock_init(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + sched_clock_running = 1; } +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + +#endif + unsigned long long cpu_clock(int cpu) { unsigned long long clock; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cf2cd6c..fb8994c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -899,7 +899,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * doesn't make sense. Rely on vruntime for fairness. */ if (rq->curr != p) - delta = max(10000LL, delta); + delta = max_t(s64, 10000LL, delta); hrtick_start(rq, delta); } @@ -1442,18 +1442,23 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - while (next != &cfs_rq->tasks) { + if (next == &cfs_rq->tasks) + return NULL; + + /* Skip over entities that are not tasks */ + do { se = list_entry(next, struct sched_entity, group_node); next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); - /* Skip over entities that are not tasks */ - if (entity_is_task(se)) { - p = task_of(se); - break; - } - } + if (next == &cfs_rq->tasks) + return NULL; cfs_rq->balance_iterator = next; + + if (entity_is_task(se)) + p = task_of(se); + return p; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 862b06b..9353ca7 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 908c04f..1113157 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -298,7 +300,7 @@ static void __disable_runtime(struct rq *rq) struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; - if (iter == rt_rq) + if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; spin_lock(&iter->rt_runtime_lock); @@ -348,6 +350,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@ -438,9 +441,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -491,9 +491,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -861,6 +863,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); + static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) @@ -1022,7 +1026,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) break; /* try again */ - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; } @@ -1091,7 +1095,7 @@ static int push_rt_task(struct rq *rq) resched_task(lowest_rq->curr); - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); ret = 1; out: @@ -1197,7 +1201,7 @@ static int pull_rt_task(struct rq *this_rq) } skip: - spin_unlock(&src_rq->lock); + double_unlock_balance(this_rq, src_rq); } return ret; diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8..94a62c0 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, waiter.up = 0; for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) + if (signal_pending_state(state, task)) goto interrupted; if (timeout <= 0) goto timed_out; diff --git a/kernel/signal.c b/kernel/signal.c index 954f77d..e661b01 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1304,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun++; goto out; } + q->info.si_overrun = 0; signalfd_notify(t, sig); pending = group ? &t->signal->shared_pending : &t->pending; @@ -1337,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + int ret = sig; BUG_ON(sig == -1); @@ -1401,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ - tsk->exit_signal = -1; + ret = tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = -1; } @@ -1410,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); - return sig; + return ret; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) diff --git a/kernel/smp.c b/kernel/smp.c index 96fc7c0..f362a85 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void) */ smp_wmb(); data->csd.flags &= ~CSD_FLAG_WAIT; - } else + } + if (data->csd.flags & CSD_FLAG_ALLOC) call_rcu(&data->rcu_head, rcu_free_call_data); } rcu_read_unlock(); @@ -209,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { struct call_single_data d; unsigned long flags; - /* prevent preemption and reschedule on another processor */ + /* prevent preemption and reschedule on another processor, + as well as CPU removal */ int me = get_cpu(); + int err = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -219,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -235,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, data->func = func; data->info = info; generic_exec_single(cpu, data); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); - return 0; + return err; } EXPORT_SYMBOL(smp_call_function_single); @@ -260,6 +265,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* Dummy function */ +static void quiesce_dummy(void *unused) +{ +} + +/* + * Ensure stack based data used in call function mask is safe to free. + * + * This is needed by smp_call_function_mask when using on-stack data, because + * a single call function queue is shared by all CPUs, and any CPU may pick up + * the data item on the queue at any time before it is deleted. So we need to + * ensure that all CPUs have transitioned through a quiescent state after + * this call. + * + * This is a very slow function, implemented by sending synchronous IPIs to + * all possible CPUs. For this reason, we have to alloc data rather than use + * stack based data even in the case of synchronous calls. The stack based + * data is then just used for deadlock/oom fallback which will be very rare. + * + * If a faster scheme can be made, we could go back to preferring stack based + * data -- the data allocation/free is non-zero cost. + */ +static void smp_call_function_mask_quiesce_stack(cpumask_t mask) +{ + struct call_single_data data; + int cpu; + + data.func = quiesce_dummy; + data.info = NULL; + + for_each_cpu_mask(cpu, mask) { + data.flags = CSD_FLAG_WAIT; + generic_exec_single(cpu, &data); + } +} + /** * smp_call_function_mask(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on. @@ -285,6 +326,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, cpumask_t allbutself; unsigned long flags; int cpu, num_cpus; + int slowpath = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -306,15 +348,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, return smp_call_function_single(cpu, func, info, wait); } - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->csd.flags = CSD_FLAG_ALLOC; - } - if (!data) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) { + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; + } else { data = &d; data->csd.flags = CSD_FLAG_WAIT; wait = 1; + slowpath = 1; } spin_lock_init(&data->lock); @@ -331,8 +374,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, arch_send_call_function_ipi(mask); /* optionally wait for the CPUs to complete */ - if (wait) + if (wait) { csd_flag_wait(&data->csd); + if (unlikely(slowpath)) + smp_call_function_mask_quiesce_stack(mask); + } return 0; } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492..cb838ee 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu) do_each_thread(g, t) { if (!--max_count) goto unlock; - if (t->state & TASK_UNINTERRUPTIBLE) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); } while_each_thread(g, t); unlock: diff --git a/kernel/spinlock.c b/kernel/spinlock.c index a1fb54c..29ab207 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } - EXPORT_SYMBOL(_spin_lock_nested); + unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) { unsigned long flags; @@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas #endif return flags; } - EXPORT_SYMBOL(_spin_lock_irqsave_nested); +void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} +EXPORT_SYMBOL(_spin_lock_nest_lock); + #endif void __lockfunc _spin_unlock(spinlock_t *lock) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e446c7c..af3c7ce 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -65,7 +65,6 @@ static void ack_state(void) static int stop_cpu(struct stop_machine_data *smdata) { enum stopmachine_state curstate = STOPMACHINE_NONE; - int uninitialized_var(ret); /* Simple state machine */ do { diff --git a/kernel/sys.c b/kernel/sys.c index c0185809..038a7bc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -274,7 +274,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -static void kernel_restart_prepare(char *cmd) +void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fe47133..50ec088 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { + .count = 1, .ctl_table = root_table, .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1..f8d9680 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } /** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) * @@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev) /* * Noop handler when we shut down an event device */ -static void clockevents_handle_noop(struct clock_event_device *dev) +void clockevents_handle_noop(struct clock_event_device *dev) { } @@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); @@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd..1ad46f3 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy) if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d3..f1f3eee 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { + ktime_t next; + tick_do_periodic_broadcast(); /* @@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have - * periodic mode: + * periodic mode. We read dev->next_event first and add to it + * when the event alrady expired. clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. */ - for (;;) { - ktime_t next = ktime_add(dev->next_event, tick_period); + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, ktime_get())) return; @@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags, *reason = why; - int cpu; + int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -223,14 +228,15 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; + bc_stopped = cpus_empty(tick_broadcast_mask); + switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -245,9 +251,10 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - else { + if (cpus_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else @@ -298,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -313,7 +320,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -364,16 +371,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - ktime_t now = ktime_get(); - int res; - - for(;;) { - res = clockevents_program_event(bc, expires, now); - if (!res || !force) - return res; - now = ktime_get(); - expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); - } + + return tick_dev_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -491,14 +490,52 @@ static void tick_broadcast_clear_oneshot(int cpu) cpu_clear(cpu, tick_broadcast_oneshot_mask); } +static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu_mask_nr(cpu, *mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int cpu = smp_processor_id(); + cpumask_t mask; + + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + mask = tick_broadcast_mask; + cpu_clear(cpu, mask); + cpus_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, mask); + + if (was_periodic && !cpus_empty(mask)) { + tick_broadcast_init_next_event(&mask, tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; + } } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 80c4336..019315e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td, } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; @@ -248,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -310,7 +311,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7..6e9db97 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ @@ -17,6 +19,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c049..2e8de67 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -23,24 +23,56 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event internal worker function */ -int tick_program_event(ktime_t expires, int force) +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; ktime_t now = ktime_get(); + int i; - while (1) { + for (i = 0;;) { int ret = clockevents_program_event(dev, expires, now); if (!ret || !force) return ret; + + /* + * We tried 2 times to program the device with the given + * min_delta_ns. If that's not working then we double it + * and emit a warning. + */ + if (++i > 2) { + /* Increase the min. delta and try again */ + if (!dev->min_delta_ns) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + + i = 0; + } + now = ktime_get(); - expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + expires = ktime_add_ns(now, dev->min_delta_ns); } } /** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return tick_dev_program_event(dev, expires, force); +} + +/** * tick_resume_onshot - resume oneshot mode */ void tick_resume_oneshot(void) @@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, ktime_get()); + tick_dev_program_event(newdev, next_event, 1); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c0..a87b046 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu) ts->idle_lastupdate = now; ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_active = 0; + + sched_clock_idle_wakeup_event(0); } } @@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } ts->idle_entrytime = now; ts->idle_active = 1; + sched_clock_idle_sleep_event(); return now; } @@ -289,7 +292,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); - sched_clock_tick_stop(cpu); } /* @@ -392,7 +394,6 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* @@ -645,17 +646,21 @@ void tick_setup_sched_timer(void) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } +#endif /* HIGH_RES_TIMERS */ +#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +# ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); +# endif ts->nohz_mode = NOHZ_MODE_INACTIVE; } -#endif /* HIGH_RES_TIMERS */ +#endif /** * Async notification about clocksource changes diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9ab059..532858f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -6,7 +6,6 @@ */ #include <linux/module.h> -#include <linux/version.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index 64d398f..815237a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/err.h> #include <linux/slab.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index fe3a56c..4ab9659 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/sysctl.h> static void *get_uts(ctl_table *table, int write) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f6..4048e92 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); f(work); - lock_release(&lockdep_map, 1, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " @@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq) int cpu; might_sleep(); - lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } @@ -441,8 +441,8 @@ int flush_work(struct work_struct *work) if (!cwq) return 0; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); prev = NULL; spin_lock_irq(&cwq->lock); @@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work) might_sleep(); - lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&work->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); cwq = get_wq_data(work); if (!cwq) @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, start_workqueue_thread(cwq, -1); } else { cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); - + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); if (err || !cpu_online(cpu)) @@ -861,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) if (cwq->thread == NULL) return; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); flush_cpu_workqueue(cwq); /* |